#!/usr/bin/env python3
"""Junior AE v2 - Browser-like website validation.

Polls CRM notes, extracts the website URL embedded in each note body,
validates it with a browser-like GET request, and upgrades the lead
temperature (COLD -> WARM -> HOT) when the site looks like a real,
working website. Runs forever, sleeping 3 hours between passes.
"""
import json
import os
import re
import ssl
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timedelta
from pathlib import Path

# SECURITY NOTE(review): this disables TLS certificate verification for the
# ENTIRE process, exposing every HTTPS request to MITM. Kept because the
# original script relies on it; prefer a properly configured SSLContext.
ssl._create_default_https_context = ssl._create_unverified_context

SCRIPT_DIR = Path(__file__).parent
for d in [SCRIPT_DIR / "state", SCRIPT_DIR / "logs"]:
    d.mkdir(parents=True, exist_ok=True)

STATE_FILE = SCRIPT_DIR / "state" / "jae-v2-state.json"
LOG_FILE = SCRIPT_DIR / "logs" / f"jae-v2-{datetime.now().strftime('%Y%m%d')}.log"

CRM_URL = "https://salesforce.hoaledgeriq.com/rest"
# SECURITY NOTE(review): a long-lived API key was committed in source here.
# It should be rotated and supplied via the environment; the hard-coded
# literal is kept only as a backward-compatible fallback.
CRM_TOKEN = os.environ.get("JAE_CRM_TOKEN") or (
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9."
    "eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoi"
    "QVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3"
    "N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEy"
    "YzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9."
    "zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
)

# Cap for manual redirect-following in validate_website(); the original
# recursed without a limit and could loop forever on a redirect cycle.
MAX_REDIRECTS = 5

# Browser-like request headers so sites that block obvious bots still respond.
_BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "identity",
    "Connection": "keep-alive",
}


def log(msg):
    """Print *msg* with a HH:MM:SS timestamp and append it to today's log file."""
    ts = datetime.now().strftime('%H:%M:%S')
    print(f"[{ts}] {msg}")
    with open(LOG_FILE, 'a') as f:
        f.write(f"[{ts}] {msg}\n")


def load_state():
    """Load persisted run state, or a fresh default looking back 2 hours."""
    if STATE_FILE.exists():
        return json.loads(STATE_FILE.read_text())
    return {
        "last_check": (datetime.now() - timedelta(hours=2)).isoformat(),
        "processed": 0,
        "upgraded": 0,
    }


def save_state(s):
    """Persist run state *s* as pretty-printed JSON."""
    STATE_FILE.write_text(json.dumps(s, indent=2))


def fetch_notes():
    """Fetch the 50 most recent CRM notes; returns [] on any error."""
    try:
        req = urllib.request.Request(
            f"{CRM_URL}/notes?limit=50&order[createdAt]=desc",
            headers={"Authorization": f"Bearer {CRM_TOKEN}", "Accept": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=15) as r:
            return json.loads(r.read().decode()).get('data', {}).get('notes', [])
    except Exception as e:
        # Best-effort: a failed poll just means an empty batch this cycle.
        log(f"Fetch error: {e}")
        return []


def get_temp(title):
    """Extract lead temperature ('HOT'/'WARM'/'COLD') from a note title, else None."""
    t = title.upper()
    if 'HOT' in t:
        return 'HOT'
    if 'WARM' in t:
        return 'WARM'
    if 'COLD' in t:
        return 'COLD'
    return None


def extract_url(body):
    """Pull the first website URL out of a note body.

    Prefers an explicit ``Site: <url>`` marker; falls back to the first
    HTTP(S) URL anywhere in the text. Returns None when nothing matches.
    """
    if not body:
        return None
    # Preferred: explicit **Site:** URL pattern
    m = re.search(r'Site:\s*(https?://[^\s\n<]+)', str(body))
    if m:
        return m.group(1).strip()
    # Fallback - any HTTP URL
    m = re.search(r'(https?://[^\s\n<"]+)', str(body))
    return m.group(1) if m else None


def validate_website(url, _depth=0):
    """Browser-like validation - GET request, check for real website content.

    Returns a (is_valid, status) tuple where *status* is a short reason
    string (e.g. "real_website", "http_404", "too_small", "no_title").
    *_depth* is internal recursion depth for manual redirect-following,
    capped at MAX_REDIRECTS (the original recursed without a bound).
    """
    if not url:
        return False, "no_url"
    if not url.startswith('http'):
        url = 'https://' + url
    try:
        req = urllib.request.Request(url, headers=_BROWSER_HEADERS)
        with urllib.request.urlopen(
            req, timeout=15, context=ssl._create_unverified_context()
        ) as r:
            content = r.read()
            code = r.getcode()
        # Must return 200
        if code != 200:
            return False, f"http_{code}"
        # Must have content > 500 bytes
        if len(content) < 500:
            return False, "too_small"
        # Parse the first chunk of HTML for real-website markers.
        # NOTE(review): the committed copy of these three checks was corrupted
        # (angle brackets stripped); reconstructed from the surrounding logic
        # and the "no_title"/"no_body"/"no_content" status strings.
        html = content.decode('utf-8', errors='ignore')[:3000].lower()
        has_title = '<title' in html and '</title>' in html
        has_body = '<body' in html
        text_content = re.sub(r'<[^>]+>', '', html)
        has_real_content = len(text_content.strip()) > 100
        if has_title and has_body and has_real_content:
            return True, "real_website"
        missing = []
        if not has_title:
            missing.append("no_title")
        if not has_body:
            missing.append("no_body")
        if not has_real_content:
            missing.append("no_content")
        return False, ",".join(missing)
    except urllib.error.HTTPError as e:
        if e.code in (301, 302, 307, 308) and _depth < MAX_REDIRECTS:
            # Follow redirect manually; Location may be relative, so resolve it.
            new_url = e.headers.get('Location', '')
            if new_url:
                new_url = urllib.parse.urljoin(url, new_url)
                if new_url != url:
                    return validate_website(new_url, _depth + 1)
        return False, f"http_{e.code}"
    except Exception as e:
        return False, str(e)[:40]


def upgrade(temp):
    """One-step temperature upgrade: COLD->WARM, WARM->HOT, HOT stays HOT."""
    return {'COLD': 'WARM', 'WARM': 'HOT', 'HOT': 'HOT'}.get(temp, temp)


def update_note(note_id, body, new_temp, status):
    """Append a validation stamp to the note body via the CRM PUT endpoint.

    Returns True on success, False on any error (logged, not raised).
    """
    try:
        new_body = (
            body
            + f"\n\n**JAE Validated v2:** {datetime.now().strftime('%Y-%m-%d %H:%M')}\n"
            + f"**New Temp:** {new_temp}\n**Status:** {status}"
        )
        data = json.dumps({"bodyV2": {"markdown": new_body}}).encode()
        req = urllib.request.Request(
            f"{CRM_URL}/notes/{note_id}",
            headers={"Authorization": f"Bearer {CRM_TOKEN}", "Content-Type": "application/json"},
            data=data,
            method='PUT',
        )
        with urllib.request.urlopen(req, timeout=10):
            return True
    except Exception as e:
        log(f"Update failed: {e}")
        return False


def process():
    """One validation pass: fetch notes, validate sites, upgrade temperatures."""
    s = load_state()
    log("=== JAE v2 Starting ===")
    notes = fetch_notes()
    log(f"Fetched {len(notes)} notes")
    for note in notes:
        body = note.get('bodyV2', {}).get('markdown', '')
        # BUGFIX: the skip check only looked for "__JAE_Validated v2__", but
        # update_note() writes "**JAE Validated v2:**" - so already-validated
        # notes were re-processed (and re-stamped) every run. Match the marker
        # actually written, plus the legacy underscore variants.
        if ('JAE Validated v2' in body
                or '__JAE_Validated v2__' in body
                or '__JAE_Validated__' in body):
            continue
        title = note.get('title', '')
        note_id = note.get('id')
        temp = get_temp(title)
        if not temp:
            log(f"Skip: no temp in title: {title[:30]}")
            continue
        url = extract_url(body)
        if not url:
            log(f"Skip: no URL found in: {title[:30]}")
            continue
        log(f"Validating: {url[:50]}")
        is_valid, status = validate_website(url)
        if is_valid and temp != 'HOT':
            new_temp = upgrade(temp)
            log(f"UPGRADE: {title[:40]} {temp}->{new_temp}")
            if update_note(note_id, body, new_temp, status):
                s['upgraded'] += 1
            s['processed'] += 1
        else:
            log(f"Checked: {title[:40]} {temp} (valid={is_valid}, {status})")
            s['processed'] += 1
    s['last_check'] = datetime.now().isoformat()
    save_state(s)
    log(f"=== Done: {s['processed']} processed, {s['upgraded']} upgraded ===")


def main():
    """Run process() forever with a 3-hour sleep between passes."""
    while True:
        process()
        log("Waiting 3 hours...")
        time.sleep(10800)


if __name__ == "__main__":
    main()