#!/usr/bin/env python3
"""Sales Prospector v11 - Bing scraper (more reliable than Google)"""
import json
import os
import random
import re
import ssl
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from pathlib import Path

# SECURITY NOTE(review): this turns off TLS certificate verification for every
# HTTPS request made through urllib in this process, including the CRM call
# that carries the bearer token. Kept because the original relies on it;
# consider limiting it to the scraping requests only.
ssl._create_default_https_context = ssl._create_unverified_context

SCRIPT_DIR = Path(__file__).parent
STATE_DIR, LOG_DIR, LEADS_DIR = SCRIPT_DIR / "state", SCRIPT_DIR / "logs", SCRIPT_DIR / "leads"
for d in [STATE_DIR, LOG_DIR, LEADS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

STATE_FILE = STATE_DIR / "prospector-v11-state.json"
LOG_FILE = LOG_DIR / f"prospector-v11-{datetime.now().strftime('%Y%m%d')}.log"

METROS = ["Charlotte NC", "Atlanta GA", "Orlando FL", "Phoenix AZ", "Austin TX", "Denver CO",
          "Nashville TN", "Raleigh NC", "Tampa FL", "Dallas TX", "Houston TX", "Miami FL"]

# SECURITY NOTE(review): a live-looking API token is committed in source.
# The TWENTY_TOKEN environment variable now takes precedence; the literal is
# kept only as a fallback so existing deployments keep working. Rotate the
# token and remove the literal.
TWENTY_TOKEN = os.environ.get(
    "TWENTY_TOKEN",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930",
)
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"

# Timestamp of the last outbound request, and the time until which searching
# is suspended after a block / rate-limit response.
LAST_REQ, BLOCKED_UNTIL = 0, 0

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

# Stop collecting once this many leads have been pushed (counter is persisted).
MAX_LEADS = 50

_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_EMAIL_BLOCKLIST = ('example.com', 'test.com', 'noreply@')
_HOA_KEYWORDS = ('hoa', 'homeowners', 'association', 'community', 'condo', 'village',
                 'creek', 'estates', 'mgmt', 'management')
_NON_HOA_DOMAINS = ('bing.com', 'microsoft.com', 'facebook.com', 'yelp.com', 'bbb.org')

# NOTE(review): the result-extraction pattern was destroyed in the reviewed
# source (its "<...>" parts were stripped); only "<li ... ]*>.*?" survived.
# This is a reconstruction and must be confirmed against the original.
_RESULT_LINK_RE = re.compile(
    r'<li class="b_algo"[^>]*>.*?<a[^>]+href="(https?://[^"]+)"', re.DOTALL)

# NOTE(review): the tag names in these two patterns were also stripped in the
# reviewed source ("]*>.*?" remained); script/style is the presumed original.
_SCRIPT_RE = re.compile(r'<script[^>]*>.*?</script>', re.DOTALL | re.I)
_STYLE_RE = re.compile(r'<style[^>]*>.*?</style>', re.DOTALL | re.I)


def log(msg):
    """Print a timestamped message and append it to today's log file."""
    ts = datetime.now().strftime('%H:%M:%S')
    line = f"[{ts}] {msg}"
    print(line)
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + '\n')


def throttle():
    """Sleep so that consecutive requests are spaced by a random 5-10 seconds."""
    global LAST_REQ
    delay = random.uniform(5, 10)  # 5-10s between requests
    if LAST_REQ > 0:
        elapsed = time.time() - LAST_REQ
        if elapsed < delay:
            time.sleep(delay - elapsed)
    LAST_REQ = time.time()


def load_state():
    """Load the persisted run state, filling in defaults for missing keys.

    Returns a dict with 'metro_idx', 'crm' (a set of domains already pushed),
    'leads' and 'cycle'. A corrupt state file still raises, so the
    already-pushed set is never silently discarded.
    """
    state = {"metro_idx": 0, "crm": set(), "leads": 0, "cycle": 0}
    if STATE_FILE.exists():
        state.update(json.loads(STATE_FILE.read_text()))
    state['crm'] = set(state.get('crm') or [])
    return state


def save_state(s):
    """Persist the run state as JSON (the 'crm' set is stored as a list)."""
    tmp = s.copy()
    tmp['crm'] = list(s['crm'])
    # Write to a sibling file first so an interrupted write cannot leave a
    # truncated state file behind.
    scratch = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    scratch.write_text(json.dumps(tmp, indent=2))
    scratch.replace(STATE_FILE)


def _extract_result_urls(html):
    """Return the distinct result URLs found in a Bing response, in page order."""
    urls = []
    for link in _RESULT_LINK_RE.findall(html):
        if link not in urls:
            urls.append(link)
    return urls


def search_bing(query):
    """Run one Bing search and return up to 10 result URLs.

    Returns an empty list while a block/backoff window is active and on any
    error. Sets BLOCKED_UNTIL when the response looks like a block page
    (30 min) or the server answers 429/503 (15 min).
    """
    global BLOCKED_UNTIL
    if time.time() < BLOCKED_UNTIL:
        return []
    throttle()
    log(f"SEARCH: {query}")
    try:
        # Bing search URL
        url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}&format=rss"
        req = urllib.request.Request(url, headers={
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/rss+xml",
            "Accept-Language": "en-US,en;q=0.9",
        })
        with urllib.request.urlopen(req, timeout=15) as r:
            html = r.read().decode('utf-8', errors='ignore')
        if any(x in html.lower() for x in ['captcha', 'blocked', 'unusual']):
            BLOCKED_UNTIL = time.time() + 1800
            log(f" -> BLOCKED, pausing 30min")
            return []
        # Extract result URLs
        urls = _extract_result_urls(html)
        log(f" -> {len(urls)} URLs")
        return urls[:10]
    except urllib.error.HTTPError as e:
        if e.code in (429, 503):
            BLOCKED_UNTIL = time.time() + 900
            log(f" -> Rate limited ({e.code}), backoff 15min")
        else:
            log(f" -> HTTP error {e.code}")
        return []
    except Exception as e:
        log(f" -> Error: {str(e)[:40]}")
        return []


def _html_to_text(html):
    """Strip script/style blocks and tags, collapse whitespace, cap at 2000 chars."""
    text = _SCRIPT_RE.sub('', html)
    text = _STYLE_RE.sub('', text)
    text = re.sub(r'<[^>]+>', ' ', text)
    return re.sub(r'\s+', ' ', text)[:2000]


def fetch_page(url):
    """Fetch a page and return its visible text (at most 2000 chars), or "" on failure."""
    throttle()
    try:
        req = urllib.request.Request(url, headers={"User-Agent": random.choice(USER_AGENTS)})
        with urllib.request.urlopen(req, timeout=10) as r:
            html = r.read().decode('utf-8', errors='ignore')
        return _html_to_text(html)
    except Exception as e:
        # Best effort: a page that cannot be fetched just yields no emails.
        log(f" -> Fetch failed: {str(e)[:40]}")
        return ""


def get_domain(url):
    """Return the lower-cased host of a URL without a leading 'www.', or None if unparsable."""
    try:
        d = urllib.parse.urlparse(url).netloc.lower()
        return d[4:] if d.startswith('www.') else d
    except (ValueError, TypeError, AttributeError):
        return None


def is_hoa(d):
    """Return True if the domain contains an HOA-style keyword and no excluded site."""
    if not d:
        return False
    dl = d.lower()
    return any(k in dl for k in _HOA_KEYWORDS) and not any(b in dl for b in _NON_HOA_DOMAINS)


def get_emails(t):
    """Return up to three distinct lower-cased email addresses found in the text.

    Addresses are kept in first-seen order, so the result is the same on
    every run. Placeholder and no-reply addresses are dropped.
    """
    if not t:
        return []
    found = {}
    for raw in _EMAIL_RE.findall(t):
        email = raw.lower()
        # Filter after lower-casing so "NoReply@..." is caught too.
        if len(email) > 8 and not any(b in email for b in _EMAIL_BLOCKLIST):
            found.setdefault(email, None)
    return list(found)[:3]


def push_crm(lead):
    """POST the lead to the CRM as a note; return True on success, False otherwise."""
    try:
        note = {
            "title": f"{lead['q']}: {lead['d']}",
            "bodyV2": {"markdown": f"## {lead['q']} Lead\n\n**HOA:** {lead['n']}\n**Metro:** {lead['m']}\n**Site:** {lead['u']}\n**Emails:** {', '.join(lead['e']) or 'None'}"},
        }
        req = urllib.request.Request(
            f"{TWENTY_BASE}/notes",
            headers={"Authorization": f"Bearer {TWENTY_TOKEN}", "Content-Type": "application/json"},
            data=json.dumps(note).encode(),
            method='POST',
        )
        with urllib.request.urlopen(req, timeout=10):
            pass
        log(f"CRM: {lead['d']}")
        return True
    except Exception as e:
        log(f"CRM failed for {lead.get('d', '?')}: {str(e)[:40]}")
        return False


def main():
    """Cycle through the metros, search for HOA sites and push new ones to the CRM."""
    log("=== Prospector v11 Started (Bing scraper) ===")
    s = load_state()
    queries = ["{m} HOA", "{m} homeowners association", "{m} HOA management"]
    while True:
        s['cycle'] += 1
        metro = METROS[s['metro_idx'] % len(METROS)]
        blocked = time.time() < BLOCKED_UNTIL
        status = "(blocked)" if blocked else ""
        log(f"CYCLE {s['cycle']}: {metro} {status}")
        if blocked:
            time.sleep(60)
            continue
        start, found = time.time(), 0
        for qtmpl in queries:
            if s['leads'] >= MAX_LEADS:
                break
            urls = search_bing(qtmpl.format(m=metro))
            for url in urls[:6]:
                if s['leads'] >= MAX_LEADS:
                    break
                dom = get_domain(url)
                if not dom or dom in s['crm'] or not is_hoa(dom):
                    continue
                text = fetch_page(url)
                emails = get_emails(text)
                lead = {
                    'n': dom.split('.')[0].replace('-', ' ').title() + " HOA",
                    'm': metro,
                    'u': url,
                    'd': dom,
                    'e': emails,
                    'q': "HOT" if len(emails) >= 2 else "WARM" if emails else "COLD",
                }
                if push_crm(lead):
                    s['crm'].add(dom)
                    s['leads'] += 1
                    found += 1
                    log(f"LEAD {s['leads']}: {lead['n']} ({lead['q']})")
        s['metro_idx'] = (s['metro_idx'] + 1) % len(METROS)
        save_state(s)
        # NOTE(review): the reviewed source is cut off in the middle of the
        # next log call. Everything from here to the end of the file is a
        # minimal reconstruction - confirm against the original.
        log(f"Done: {found} new, {s['leads']} total ({time.time() - start:.0f}s)")
        if s['leads'] >= MAX_LEADS:
            # Without this the loop would spin forever doing no work.
            log(f"Lead cap of {MAX_LEADS} reached, stopping")
            return


if __name__ == "__main__":
    main()