#!/usr/bin/env python3
"""Sales Prospector v14 - 50 metros + suburbs.

Rotates through US metro areas (plus a few suburbs each), searches a SearXNG
instance for HOA / homeowners-association websites, scrapes each candidate
page for contact emails, and pushes qualified leads into a Twenty CRM
instance as notes.  Progress (metro index, seen domains, lead count) is
persisted to a JSON state file so the script can resume after restarts.
"""
import json
import random
import re
import ssl
import time
import urllib.parse
import urllib.request
from datetime import datetime
from pathlib import Path

# NOTE(review): disables TLS certificate verification globally — accepted
# here because the SearXNG/CRM hosts use self-managed certs, but this makes
# every HTTPS request vulnerable to MITM.  Consider a custom SSLContext
# limited to the known hosts instead.
ssl._create_default_https_context = ssl._create_unverified_context

# Ensure working directories exist next to the script.
for d in [Path(__file__).parent / x for x in ["state", "logs", "leads"]]:
    d.mkdir(parents=True, exist_ok=True)

STATE_FILE = Path(__file__).parent / "state" / "prospector-v14-state.json"
LOG_FILE = Path(__file__).parent / "logs" / f"prospector-v14-{datetime.now().strftime('%Y%m%d')}.log"

# TOP 50 METROS + surrounding cities.
# Each entry: (metro name ending in the 2-letter state code, [suburb names]).
# main() appends the state code from the metro name to each suburb when
# building search queries.
METROS = [
    # Top 20 major metros
    ("New York NY", ["Manhattan", "Brooklyn", "Queens", "Bronx", "Staten Island", "Jersey City", "Newark"]),
    ("Los Angeles CA", ["Santa Monica", "Pasadena", "Burbank", "Glendale", "Long Beach", "Anaheim"]),
    ("Chicago IL", ["Evanston", "Oak Park", "Naperville", "Schaumburg", "Skokie"]),
    ("Houston TX", ["Sugar Land", "The Woodlands", "Katy", "Pearland", "Baytown"]),
    ("Phoenix AZ", ["Scottsdale", "Tempe", "Mesa", "Chandler", "Glendale"]),
    ("Philadelphia PA", ["Camden", "Chester", "Upper Darby"]),
    ("San Antonio TX", ["New Braunfels", "Schertz", "Cibolo"]),
    ("San Diego CA", ["Chula Vista", "Oceanside", "Escondido", "Carlsbad"]),
    ("Dallas TX", ["Fort Worth", "Arlington", "Plano", "Irving", "Frisco", "McKinney"]),
    ("San Jose CA", ["Sunnyvale", "Santa Clara", "Mountain View", "Palo Alto"]),
    ("Austin TX", ["Round Rock", "Cedar Park", "Georgetown", "Pflugerville"]),
    ("Jacksonville FL", ["Orange Park", "St. Augustine", "Ponte Vedra"]),
    ("Columbus OH", ["Dublin", "Westerville", "Gahanna", "Reynoldsburg"]),
    ("Charlotte NC", ["Matthews", "Mint Hill", "Huntersville", "Concord", "Gastonia"]),
    ("Indianapolis IN", ["Carmel", "Fishers", "Noblesville", "Greenwood"]),
    ("San Francisco CA", ["Oakland", "Berkeley", "Richmond", "Walnut Creek"]),
    ("Seattle WA", ["Bellevue", "Redmond", "Tacoma", "Kirkland", "Renton"]),
    ("Denver CO", ["Aurora", "Lakewood", "Thornton", "Westminster", "Boulder"]),
    ("Oklahoma City OK", ["Edmond", "Norman", "Moore", "Midwest City"]),
    ("Boston MA", ["Cambridge", "Somerville", "Brookline", "Newton"]),
    # Next 30 metros
    ("Portland OR", ["Beaverton", "Gresham", "Hillsboro", "Lake Oswego"]),
    ("Las Vegas NV", ["Henderson", "North Las Vegas", "Summerlin"]),
    ("Nashville TN", ["Franklin", "Brentwood", "Hendersonville", "Murfreesboro"]),
    ("Detroit MI", ["Warren", "Sterling Heights", "Dearborn", "Livonia"]),
    # FIX: removed a duplicate "Oklahoma City OK" entry that appeared here as
    # well as in the top-20 section — it caused the metro to be crawled twice
    # per rotation.
    ("Memphis TN", ["Germantown", "Collierville", "Bartlett"]),
    ("Louisville KY", ["Jeffersonville", "New Albany", "Elizabethtown"]),
    ("Milwaukee WI", ["Waukesha", "West Allis", "Wauwatosa"]),
    ("Baltimore MD", ["Columbia", "Ellicott City", "Towson"]),
    ("Albuquerque NM", ["Rio Rancho", "Santa Fe", "Los Lunas"]),
    ("Tucson AZ", ["Marana", "Oro Valley", "Sahuarita"]),
    ("Mesa AZ", ["Gilbert", "Chandler", "Tempe"]),
    ("Fresno CA", ["Clovis", "Madera", "Sanger"]),
    ("Atlanta GA", ["Sandy Springs", "Roswell", "Johns Creek", "Alpharetta", "Marietta"]),
    ("Sacramento CA", ["Elk Grove", "Roseville", "Folsom", "Davis"]),
    ("Kansas City MO", ["Overland Park", "Olathe", "Independence", "Leawood"]),
    ("Colorado Springs CO", ["Fountain", "Monument", "Woodland Park"]),
    ("Raleigh NC", ["Cary", "Apex", "Holly Springs", "Wake Forest"]),
    ("Omaha NE", ["Bellevue", "Papillion", "La Vista"]),
    ("Miami FL", ["Miami Beach", "Coral Gables", "Hialeah", "Fort Lauderdale"]),
    ("Long Beach CA", ["Lakewood", "Signal Hill"]),
    ("Virginia Beach VA", ["Norfolk", "Chesapeake", "Newport News", "Hampton"]),
    ("Oakland CA", ["Berkeley", "Alameda", "San Leandro"]),
    ("Minneapolis MN", ["St. Paul", "Bloomington", "Plymouth", "Edina"]),
    ("Tulsa OK", ["Broken Arrow", "Bixby", "Jenks"]),
    ("Tampa FL", ["St. Petersburg", "Clearwater", "Brandon", "Lutz"]),
    ("Arlington TX", ["Grand Prairie", "Euless", "Bedford"]),
    ("Wichita KS", ["Overland Park", "Lenexa", "Shawnee"]),
    ("Bakersfield CA", ["Delano", "Oildale", "Rosedale"]),
    ("Aurora CO", ["Centennial", "Parker", "Englewood"]),
    ("Anaheim CA", ["Fullerton", "Orange", "Garden Grove", "Brea"]),
    ("Santa Ana CA", ["Irvine", "Costa Mesa", "Tustin", "Newport Beach"]),
    ("Corpus Christi TX", ["Portland", "Kingsville", "Alice"]),
    ("Riverside CA", ["Moreno Valley", "Corona", "Jurupa Valley", "Norco"]),
    ("Lexington KY", ["Georgetown", "Richmond", "Winchester"]),
    ("Stockton CA", ["Lodi", "Tracy", "Manteca"]),
]

SEARXNG = "https://search.sensetostyle.com"
# NOTE(review): hard-coded API bearer token checked into source — should be
# moved to an environment variable / secrets store and this one rotated.
TWENTY_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"

# Timestamp (time.time()) of the most recent outbound HTTP request; 0 = none yet.
LAST_REQ = 0


def log(m):
    """Print *m* with a HH:MM:SS prefix and append the same line to LOG_FILE."""
    ts = datetime.now().strftime('%H:%M:%S')
    print(f"[{ts}] {m}")
    with open(LOG_FILE, 'a') as f:
        f.write(f"[{ts}] {m}\n")


def throttle():
    """Sleep so that consecutive HTTP requests are 3-6 s apart (jittered)."""
    global LAST_REQ
    dly = random.uniform(3, 6)
    if LAST_REQ > 0 and (time.time() - LAST_REQ) < dly:
        time.sleep(dly - (time.time() - LAST_REQ))
    LAST_REQ = time.time()


def load():
    """Load persisted crawl state, or return a fresh default state.

    Keys: m (metro rotation index), crm (set of domains already pushed),
    leads (total lead count), cycle (loop iteration count).
    """
    if STATE_FILE.exists():
        s = json.loads(STATE_FILE.read_text())
        # 'crm' is stored as a JSON list; use a set in memory for O(1) lookups.
        s['crm'] = set(s.get('crm', []))
        return s
    return {"m": 0, "crm": set(), "leads": 0, "cycle": 0}


def save(s):
    """Persist state *s* to STATE_FILE (sets are not JSON-serializable,
    so 'crm' is converted back to a list)."""
    tmp = s.copy()
    tmp['crm'] = list(s['crm'])
    STATE_FILE.write_text(json.dumps(tmp, indent=2))


def search(q):
    """Query the SearXNG instance for *q*; return up to 12 unique result URLs.

    Best-effort: any network/parse failure yields an empty list.
    """
    throttle()
    try:
        url = f"{SEARXNG}/search?q={urllib.parse.quote(q)}"
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as r:
            html = r.read().decode('utf-8', errors='ignore')
        urls = [m for m in re.findall(r'href="(https?://[^"]+)"', html)
                if 'sensetostyle' not in m and 'archive.org' not in m]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(urls))[:12]
    except Exception:
        return []


def get_dom(url):
    """Return the lowercased hostname of *url* without a 'www.' prefix,
    or None if *url* cannot be parsed."""
    try:
        d = urllib.parse.urlparse(url).netloc.lower()
        return d[4:] if d.startswith('www.') else d
    except Exception:
        return None


def is_hoa(d):
    """Heuristic: does domain *d* look like an HOA / property-management site?

    True when it contains at least one 'good' keyword and none of the
    blocklisted aggregator/social domains.
    """
    if not d:
        return False
    dl = d.lower()
    good = ['hoa', 'homeowners', 'association', 'community', 'condo', 'village',
            'mgmt', 'management', 'properties', 'realty']
    bad = ['sensetostyle', 'archive.org', 'google', 'facebook', 'yelp', 'bbb',
           'wiki', 'reddit', 'linkedin', 'trulia', 'realtor', 'zillow']
    return any(k in dl for k in good) and not any(b in dl for b in bad)


def fetch(url):
    """Fetch *url* and return up to 2000 chars of whitespace-normalized,
    tag-stripped page text ('' on any failure)."""
    throttle()
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=10) as r:
            t = r.read().decode('utf-8', errors='ignore')
        # FIX: the original had empty regex patterns here (the <script>/<style>
        # patterns were lost) and a 'resub' typo (NameError) that, combined
        # with the bare except, made fetch() ALWAYS return "" — so no lead
        # ever got emails.  Strip script/style blocks, then all tags.
        t = re.sub(r'<script\b.*?</script>', ' ', t, flags=re.DOTALL | re.I)
        t = re.sub(r'<style\b.*?</style>', ' ', t, flags=re.DOTALL | re.I)
        t = re.sub(r'<[^>]+>', ' ', t)
        return re.sub(r'\s+', ' ', t)[:2000]
    except Exception:
        return ""


def get_emails(t):
    """Extract up to 3 unique, lowercased email addresses from text *t*."""
    if not t:
        return []
    ems = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', t)
    # len > 8 filters out junk like "a@b.co"; the regex already guarantees '@'.
    return list({e.lower() for e in ems if len(e) > 8})[:3]


def crm_push(lead):
    """POST *lead* to the Twenty CRM as a note.  Returns True on success."""
    try:
        note = {
            "title": f"{lead['q']}: {lead['d']}",
            "bodyV2": {
                "markdown": (
                    f"## {lead['q']} Lead\n\n**HOA:** {lead['n']}\n"
                    f"**Metro:** {lead['m']}\n**City:** {lead['c']}\n"
                    f"**Site:** {lead['u']}\n"
                    f"**Emails:** {', '.join(lead['e']) or 'None'}"
                )
            },
        }
        req = urllib.request.Request(
            f"{TWENTY_BASE}/notes",
            headers={"Authorization": f"Bearer {TWENTY_TOKEN}",
                     "Content-Type": "application/json"},
            data=json.dumps(note).encode(),
            method='POST',
        )
        # FIX: close the HTTP response (the original leaked it).
        with urllib.request.urlopen(req, timeout=10):
            pass
        log(f"CRM: {lead['d']}")
        return True
    except Exception:
        return False


def main():
    """Run the prospecting loop until 750 leads have been pushed to the CRM."""
    log("=== v14 STARTED - 50 Metros + Suburbs ===")
    s = load()
    queries = ["{loc} HOA", "{loc} homeowners association", "{loc} HOA management"]
    while True:
        s['cycle'] += 1
        metro_name, suburbs = METROS[s['m'] % len(METROS)]
        # Search the metro itself plus its first 3 suburbs; suburbs get the
        # metro's trailing state code appended ("Scottsdale AZ", etc.).
        state_code = metro_name.split()[-1]
        search_locations = [metro_name] + [f"{sub} {state_code}" for sub in suburbs[:3]]
        log(f"CYCLE {s['cycle']}: {metro_name} (+{len(suburbs)} suburbs) | Leads: {s['leads']}")
        new = 0
        for city in search_locations[:4]:  # metro + 3 suburbs
            if s['leads'] >= 750:
                break
            for qt in queries:
                if s['leads'] >= 750:
                    break
                urls = search(qt.format(loc=city))
                if urls:
                    log(f" | {city}: {len(urls)} URLs")
                for url in urls[:4]:
                    if s['leads'] >= 750:
                        break
                    dom = get_dom(url)
                    if not dom or dom in s['crm'] or not is_hoa(dom):
                        continue
                    # FIX: compute emails once (the original called
                    # get_emails(txt) three times for the same text).
                    emails = get_emails(fetch(url))
                    quality = "HOT" if len(emails) >= 2 else ("WARM" if emails else "COLD")
                    lead = {
                        'n': dom.split('.')[0].replace('-', ' ').title()[:30],
                        'm': metro_name,
                        'c': city,
                        'u': url,
                        'd': dom,
                        'e': emails,
                        'q': quality,
                    }
                    if crm_push(lead):
                        s['crm'].add(dom)
                        s['leads'] += 1
                        new += 1
                        log(f"LEAD {s['leads']}: {lead['n']}")
        s['m'] = (s['m'] + 1) % len(METROS)
        save(s)
        log(f"Done: {new} new | {s['leads']} total")
        if s['leads'] >= 750:
            log("TARGET 750!")
            break
        if new == 0:
            time.sleep(20)  # back off when a whole cycle produced nothing


if __name__ == "__main__":
    main()