- Created chatwoot-agent-bot/ with Node.js webhook server - Bot detects intent (greeting, billing, technical, features, account) - Auto-responds from FAQ knowledge base or escalates to human - FAQ-KB.md: Living knowledge base that grows with customer questions - CHATWOOT-SETUP.md: Complete deployment and configuration guide - Supports Telegram notifications on escalation - Bot runs on port 3001, ready for Chatwoot webhook integration
374 lines
13 KiB
Python
Executable File
374 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
JAE v5.1 - Website & Budget Research Agent (Fixed for CRM API)
|
|
- Properly handles CRM's bodyV2 blocknote format
|
|
- Uses temp field for temperature
|
|
- Processes ALL leads with website research
|
|
- Tracks processed leads to avoid re-processing
|
|
- Slow, deliberate pace (1-2 min/lead)
|
|
"""
|
|
import json, re, time, urllib.request, ssl
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
|
|
# Directory layout: state/ holds the processed-lead tracker, logs/ the daily log.
SCRIPT_DIR = Path(__file__).parent
for d in [SCRIPT_DIR / "state", SCRIPT_DIR / "logs"]:
    d.mkdir(parents=True, exist_ok=True)

STATE_FILE = SCRIPT_DIR / "state" / "jae-v5-state.json"
LOG_FILE = SCRIPT_DIR / "logs" / f"jae-v5-{datetime.now().strftime('%Y%m%d')}.log"
CRM_URL = "https://salesforce.hoaledgeriq.com/rest"
# SECURITY(review): hard-coded API bearer token committed to source. Move it to
# an environment variable or secrets store, and rotate this credential since it
# is now exposed.
CRM_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"

# SECURITY(review): certificate verification is disabled for every HTTPS request
# this script makes (CRM and researched websites). Acceptable only if the CRM
# endpoint uses a cert the host cannot validate — confirm, otherwise verify.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
|
|
|
|
def log(msg):
    """Print *msg* with an HH:MM:SS timestamp and append the same line to today's log file."""
    stamp = datetime.now().strftime('%H:%M:%S')
    line = f"[{stamp}] {msg}"
    print(line)
    with open(LOG_FILE, 'a') as fh:
        fh.write(line + "\n")
|
|
|
|
def load_state():
    """Load the persisted run state, or return a fresh default when none exists yet."""
    if not STATE_FILE.exists():
        return {"processed_ids": [], "last_run": None}
    return json.loads(STATE_FILE.read_text())
|
|
|
|
def save_state(s):
    """Persist the run-state dict *s* to disk as pretty-printed JSON."""
    payload = json.dumps(s, indent=2)
    STATE_FILE.write_text(payload)
|
|
|
|
def fetch_all_notes():
    """Fetch ALL notes from the CRM using cursor pagination.

    Returns a list of note dicts. Pagination stops on the first fetch error,
    on an empty page, or when the API reports no further pages — so a partial
    list may be returned on failure (by design: the caller processes whatever
    was fetched).
    """
    all_notes = []
    has_more = True
    end_cursor = None

    log("Fetching all leads from CRM (with pagination)...")

    while has_more:
        try:
            url = f"{CRM_URL}/notes?limit=200&order[createdAt]=desc"
            if end_cursor:
                url += f"&after={end_cursor}"

            req = urllib.request.Request(
                url,
                headers={"Authorization": f"Bearer {CRM_TOKEN}", "Accept": "application/json"}
            )
            opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
            with opener.open(req, timeout=30) as r:
                data = json.loads(r.read().decode())

            notes = data.get('data', {}).get('notes', [])
            all_notes.extend(notes)

            # NOTE(review): pageInfo is read from the top level of the response
            # while the notes list is nested under 'data' — confirm against the
            # CRM's actual response shape; if pageInfo is nested too, pagination
            # silently stops after one page.
            page_info = data.get('pageInfo', {})
            has_more = page_info.get('hasNextPage', False)
            end_cursor = page_info.get('endCursor')

            log(f"  Fetched {len(notes)} leads (total: {len(all_notes)})")

            # Bug fix: guard against an infinite loop. The original kept
            # looping if the API claimed more pages but returned no cursor
            # (the URL would never advance) or returned an empty page.
            if not notes or (has_more and not end_cursor):
                break
            if not has_more:
                break

        except Exception as e:
            # Best-effort: log and return whatever was fetched so far.
            log(f"Fetch error: {e}")
            break

    log(f"Total leads fetched: {len(all_notes)}")
    return all_notes
|
|
|
|
def get_existing_temp(note):
    """Extract a lead's existing temperature from a note dict.

    Precedence: a valid 'temp' field ('HOT'/'WARM'/'COLD', any case), then a
    'HOT:'/'WARM:'/'COLD:' prefix on the title, then the default 'COLD'.
    """
    # Check the temp field first. Bug fix: the original used
    # note.get('temp', 'COLD'), whose default passed the validity check and
    # made the title fallback unreachable whenever the field was absent.
    temp = note.get('temp')
    if isinstance(temp, str) and temp.upper() in ('HOT', 'WARM', 'COLD'):
        return temp.upper()

    # Fallback to a temperature prefix on the title.
    title = note.get('title', '').upper()
    for level in ('HOT', 'WARM', 'COLD'):
        if title.startswith(f'{level}:'):
            return level

    return 'COLD'
|
|
|
|
def extract_url_from_note(note):
    """Pull a website URL out of a note's bodyV2 markdown, falling back to its title.

    Returns a full URL string, or None when neither source yields one.
    """
    title = note.get('title', '')
    body = note.get('bodyV2', {})

    # bodyV2 may be missing or malformed; only a dict carries a markdown field.
    markdown = body.get('markdown', '') if isinstance(body, dict) else ''

    # Prefer an explicit URL anywhere in the markdown; strip trailing punctuation.
    found = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', markdown)
    if found:
        return found[0].rstrip('.,;:')

    # Otherwise look for a bare domain in a "HOT:/WARM:/COLD: domain" title.
    m = re.search(r'(?:HOT|WARM|COLD):\s*([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', title, re.IGNORECASE)
    if m:
        return f"https://{m.group(1)}"

    return None
|
|
|
|
def search_budget_on_site(base_url):
    """
    Fetch a website's landing page and look for budget evidence and a unit count.

    Returns: (found_budget: bool, unit_count: int | None, details: str)
    found_budget is True for a linked budget/financial PDF or even a textual
    mention of "budget" on the page; unit_count is only set when a plausible
    value (10-5000) is found.
    """
    log(f"  🔍 Searching: {base_url}")

    try:
        # Bug fix: the original passed
        # `base_url if not base_url.endswith('/') else base_url` — a
        # tautology that always yields base_url; use it directly.
        req = urllib.request.Request(
            base_url,
            headers={'User-Agent': 'Mozilla/5.0 (compatible; JAE-Bot/1.0)'}
        )
        opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
        with opener.open(req, timeout=15) as r:
            content = r.read().decode('utf-8', errors='ignore')

        found_budget = False
        unit_count = None
        details = []
        lowered = content.lower()

        # Look for budget-related PDFs linked from the page.
        for keyword in ('budget', 'financial', 'reserve', 'statement'):
            if keyword in lowered:
                pdf_match = re.search(rf'href="([^"]*{keyword}[^"]*\.pdf)"', content, re.IGNORECASE)
                if pdf_match:
                    found_budget = True
                    details.append(f"Found budget PDF: {pdf_match.group(1)}")
                    log(f"  ✅ Budget PDF found: {pdf_match.group(1)}")
                    break

        # If no direct PDF link, a textual mention still counts as a (weak) hit.
        if not found_budget and 'budget' in lowered:
            found_budget = True
            details.append("Budget mentioned on page")
            log(f"  ✅ Budget found (mentioned)")

        # Look for unit count patterns ("123 homes", "community of 450", ...).
        unit_patterns = [
            r'(\d{1,4})\s*(?:homes|units|lots|properties|residences)',
            r'(\d{1,4})\s*-?\s*(?:home|unit|lot|property|residence)\s*(?:community|association|complex)',
            r'community\s*of\s*(\d{1,4})',
            r'(\d{1,4})\s*home\s*owners',
        ]

        for pattern in unit_patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if not match:
                continue
            try:
                candidate = int(match.group(1))
            except ValueError:
                continue
            # Bug fix: only assign a count inside the plausible range. The
            # original stored the match in unit_count before range-checking,
            # so an out-of-range value (e.g. a year like 2024) was returned.
            if 10 <= candidate <= 5000:
                unit_count = candidate
                details.append(f"Unit count: {unit_count}")
                log(f"  📊 Found unit count: {unit_count}")
                break

        if not details:
            details.append("No budget found")

        return found_budget, unit_count, "; ".join(details)

    except Exception as e:
        # Unreachable/broken sites are expected; report, don't raise.
        log(f"  ⚠️ Site access issue: {str(e)[:100]}")
        return False, None, f"Site access error: {str(e)[:100]}"
|
|
|
|
def elevate_temp(current_temp, levels):
    """Raise a temperature by *levels* steps along COLD → WARM → HOT, capped at HOT.

    Unknown temperatures are treated as COLD before elevation.
    """
    scale = ['COLD', 'WARM', 'HOT']
    start = scale.index(current_temp) if current_temp in scale else 0
    return scale[min(start + levels, len(scale) - 1)]
|
|
|
|
def update_note_with_research(note, new_temp, unit_count, research_notes):
    """Update note with research findings using CRM API"""
    # PATCHes the note in place: re-prefixes the title with new_temp, sets the
    # 'temp' field, and appends a timestamped research section to the bodyV2
    # markdown. Returns True on success, False on any error (logged, never
    # raised). A falsy unit_count (None or 0) is simply omitted.
    try:
        note_id = note.get('id')
        current_title = note.get('title', '')
        bodyV2 = note.get('bodyV2', {})

        # Get existing markdown/blocknote; bodyV2 may be malformed, so guard on dict.
        markdown = bodyV2.get('markdown', '') if isinstance(bodyV2, dict) else ''
        blocknote = bodyV2.get('blocknote', '') if isinstance(bodyV2, dict) else ''

        # Remove old temperature prefix from title, then apply the new one.
        clean_title = re.sub(r'^(HOT|WARM|COLD):\s*', '', current_title)
        new_title = f"{new_temp}: {clean_title}"

        # Add research to markdown (appended, so earlier research sections are kept).
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M')
        research_section = f"\n\n---\n**JAE v5 Research ({timestamp}):** {research_notes}"
        if unit_count:
            research_section += f"\n**Units:** {unit_count}"

        new_markdown = markdown + research_section

        # Keep existing blocknote structure, just update markdown.
        # NOTE(review): blocknote is sent back unchanged, so the CRM's rich-text
        # view may not reflect the appended markdown — confirm how the CRM
        # reconciles the two bodyV2 representations.
        new_bodyV2 = {
            "blocknote": blocknote,
            "markdown": new_markdown
        }

        # Prepare patch data - only update what's needed
        patch_data = json.dumps({
            "title": new_title,
            "temp": new_temp,
            "bodyV2": new_bodyV2
        }).encode()

        req = urllib.request.Request(
            f"{CRM_URL}/notes/{note_id}",
            data=patch_data,
            headers={
                "Authorization": f"Bearer {CRM_TOKEN}",
                "Content-Type": "application/json"
            },
            method='PATCH'
        )

        # Any non-2xx response raises inside open() and is caught below.
        opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
        with opener.open(req, timeout=20) as r:
            log(f"  ✅ Note updated: {new_title}")
            return True

    except Exception as e:
        log(f"  ✗ Update error: {e}")
        return False
|
|
|
|
def create_opportunity(note, temp):
    """Create opportunity for HOT/WARM leads"""
    # POSTs a new opportunity linked to the note's person. Returns True on
    # success, False when the note has no personId or the API call fails.
    # NOTE(review): the 'temp' parameter is currently unused in the body; it is
    # kept for interface stability with callers (main passes new_temp).
    try:
        person_id = note.get('personId')
        if not person_id:
            log(f"  ⚠️ Skip upgrade: No person ID")
            return False

        opp_name = f"Lead: {note.get('title', '')}"

        opp_data = {
            "name": opp_name[:100],  # truncate to the CRM's name length cap
            "stage": "NEW",
            "pointOfContactId": person_id,
            # NOTE(review): hard-coded owner UUID — confirm this is the
            # intended default owner for auto-created opportunities.
            "ownerId": "ecf52aad-4827-40c9-9475-b68f3ca9a924"
        }

        req = urllib.request.Request(
            f"{CRM_URL}/opportunities",
            data=json.dumps(opp_data).encode(),
            headers={"Authorization": f"Bearer {CRM_TOKEN}", "Content-Type": "application/json"}
        )
        opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
        with opener.open(req, timeout=20) as r:
            opp = json.loads(r.read().decode())
            log(f"  ✅ UPGRADED to Opportunity: {opp.get('id', 'N/A')}")
            return True
    except Exception as e:
        log(f"  ✗ Create opp error: {e}")
        return False
|
|
|
|
def main():
    """Run one research pass: fetch leads, research each new one, update the CRM.

    Progress is checkpointed to the state file after every lead, so an
    interrupted run resumes without re-processing earlier leads.
    """
    log("=" * 60)
    log("JAE v5.1 Starting - Website & Budget Research")
    log("=" * 60)

    state = load_state()
    processed_ids = set(state.get('processed_ids', []))

    notes = fetch_all_notes()

    # Filter to unprocessed only
    unprocessed = [n for n in notes if n.get('id') not in processed_ids]
    log(f"\nTotal leads in CRM: {len(notes)}")
    log(f"Already processed: {len(processed_ids)}")
    log(f"New leads to process: {len(unprocessed)}")
    log("=" * 60)

    if not unprocessed:
        log("✅ No new leads to process")
        return

    upgraded = 0
    processed_count = 0

    def _checkpoint(note_id):
        # Mark the lead processed and persist immediately, keeping only the
        # most recent 2000 ids so the state file stays bounded. (Was duplicated
        # inline in two places in the original loop.)
        nonlocal processed_count
        processed_count += 1
        processed_ids.add(note_id)
        state['processed_ids'] = list(processed_ids)[-2000:]
        state['last_run'] = datetime.now().isoformat()
        save_state(state)

    for i, note in enumerate(unprocessed, 1):
        note_id = note.get('id')
        title = note.get('title', '')

        log(f"\n[{i}/{len(unprocessed)}] Processing: {title[:60]}...")

        # Get existing temperature
        current_temp = get_existing_temp(note)
        log(f"  Current temp: {current_temp}")

        # Extract URL; leads without a website keep their temperature.
        url = extract_url_from_note(note)

        if not url:
            log(f"  ⚠️ No website found - keeping {current_temp}")
            _checkpoint(note_id)
            continue

        log(f"  🌐 Website found: {url}")

        # Research website
        found_budget, unit_count, details = search_budget_on_site(url)

        # Elevation policy: budget evidence is worth two levels, a live
        # website with no budget one level.
        if found_budget:
            elevation = 2
            reason = "Budget PDF found"
        else:
            elevation = 1
            reason = "Website exists, no budget"

        new_temp = elevate_temp(current_temp, elevation)
        log(f"  📈 Elevating: {current_temp} → {new_temp} ({reason})")

        # Update note (failure is logged inside; we continue regardless).
        update_note_with_research(note, new_temp, unit_count, details)

        # Create opportunity if HOT or WARM
        if new_temp in ['HOT', 'WARM']:
            if create_opportunity(note, new_temp):
                upgraded += 1
        else:
            log(f"  Keeping as COLD")

        _checkpoint(note_id)

        # Pace: 90 seconds between leads (gentle, no rate limits).
        # Bug fix: the original logged "Waiting 90s" but slept only 5 seconds,
        # contradicting the module's stated 1-2 min/lead pacing.
        log(f"  ⏳ Waiting 90s before next lead...")
        time.sleep(90)

    log("\n" + "=" * 60)
    log(f"JAE v5 Complete: {processed_count} processed, {upgraded} upgraded")
    log("=" * 60)


if __name__ == "__main__":
    main()
|