Initial commit: STR Optimization Manager MVP
Full-stack short-term rental management platform with: - React/Vite frontend with dark theme dashboard, performance, pricing, reservations, experiments, and settings pages - Fastify API server with auth, platform management, performance tracking, pricing, reservations, experiments, and weekly report endpoints - Playwright-based scraper service with Airbnb adapter (login with MFA, performance metrics, reservations, calendar pricing, price changes) - VRBO adapter scaffold and mock adapter for development - PostgreSQL with Drizzle ORM, migrations, and seed scripts - Job queue with worker for async scraping tasks - AES-256-GCM credential encryption for platform credentials - Session cookie persistence for scraper browser sessions - Docker Compose for PostgreSQL database Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
25
apps/scraper/package.json
Normal file
25
apps/scraper/package.json
Normal file
@@ -0,0 +1,25 @@
|
||||
{
|
||||
"name": "@str/scraper",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "tsx watch src/index.ts",
|
||||
"build": "tsc",
|
||||
"start": "node dist/index.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"@str/shared-types": "*",
|
||||
"fastify": "^5.2.0",
|
||||
"drizzle-orm": "^0.38.0",
|
||||
"postgres": "^3.4.0",
|
||||
"playwright": "^1.49.0",
|
||||
"dotenv": "^16.4.0",
|
||||
"zod": "^3.24.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^22.0.0",
|
||||
"tsx": "^4.19.0",
|
||||
"typescript": "^5.7.0"
|
||||
}
|
||||
}
|
||||
215
apps/scraper/src/adapters/airbnb/AirbnbAdapter.ts
Normal file
215
apps/scraper/src/adapters/airbnb/AirbnbAdapter.ts
Normal file
@@ -0,0 +1,215 @@
|
||||
import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
|
||||
import { PlatformAdapter } from '../base/PlatformAdapter.js';
|
||||
import {
|
||||
loginFlow,
|
||||
checkSessionFlow,
|
||||
scrapePerformanceFlow,
|
||||
scrapeReservationsFlow,
|
||||
scrapePricingFlow,
|
||||
applyPriceChangesFlow,
|
||||
} from './airbnb.flows.js';
|
||||
|
||||
// Directory that holds the persisted cookies.json; override with the AIRBNB_SESSION_DIR env var.
const SESSION_DIR = process.env.AIRBNB_SESSION_DIR || './.airbnb-session';
|
||||
|
||||
/**
 * Reports whether the browser should be launched headless.
 *
 * AIRBNB_HEADLESS is read at call time (not import time) so that changes made
 * after this module was loaded are still honoured. Headless is the default;
 * it is turned off only when the variable is "false" or "0", compared
 * case-insensitively and ignoring surrounding whitespace.
 */
function isHeadless(): boolean {
  const flag = (process.env.AIRBNB_HEADLESS ?? '').trim().toLowerCase();
  return flag !== 'false' && flag !== '0';
}
|
||||
|
||||
/**
 * Airbnb implementation of PlatformAdapter, driven by a Playwright Chromium browser.
 *
 * The adapter lazily creates one browser / context / page triple in
 * `ensureBrowser()` and tears it down in `closeBrowser()`. Session cookies are
 * written to `${SESSION_DIR}/cookies.json` and loaded into every new context.
 */
export class AirbnbAdapter extends PlatformAdapter {
  readonly platformId = 'airbnb';
  readonly displayName = 'Airbnb';

  private browser: Browser | null = null;
  private context: BrowserContext | null = null;
  private page: Page | null = null;

  // ── Browser Lifecycle ──────────────────────────────────────────────────

  /**
   * Returns the open page, launching the browser and/or creating a fresh
   * context and page when none is usable.
   */
  async ensureBrowser(): Promise<Page> {
    if (this.page && !this.page.isClosed()) {
      return this.page;
    }

    if (!this.browser || !this.browser.isConnected()) {
      const headless = isHeadless();
      console.log(`[airbnb] Launching browser (headless=${headless}, env=${process.env.AIRBNB_HEADLESS})`);
      this.browser = await chromium.launch({
        headless,
        args: ['--disable-blink-features=AutomationControlled'],
      });
      // Any context we still reference belonged to the previous browser process.
      this.context = null;
    }

    // The page is gone but its context may still be alive: keep its cookies,
    // then close it so repeated recoveries do not accumulate contexts.
    if (this.context) {
      await this.saveCookies();
      await this.context.close().catch(() => undefined);
      this.context = null;
    }

    // A regular (non-persistent) context; the session is carried over via the cookie file below.
    const context = await this.browser.newContext({
      userAgent:
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      viewport: { width: 1440, height: 900 },
      locale: 'en-US',
      timezoneId: 'America/New_York',
    });
    this.context = context;

    // Best-effort cookie restore: a missing file is normal, a broken one is only reported.
    try {
      const fs = await import('fs');
      const cookiePath = `${SESSION_DIR}/cookies.json`;
      if (fs.existsSync(cookiePath)) {
        const cookies = JSON.parse(fs.readFileSync(cookiePath, 'utf-8'));
        await context.addCookies(cookies);
        console.log('[airbnb] Restored saved session cookies');
      }
    } catch (err: unknown) {
      console.warn('[airbnb] Could not restore saved cookies:', err);
    }

    this.page = await context.newPage();
    return this.page;
  }

  /** Writes the current context's cookies to disk; failures are logged, never thrown. */
  private async saveCookies(): Promise<void> {
    if (!this.context) return;
    try {
      const fs = await import('fs');
      const cookies = await this.context.cookies();
      fs.mkdirSync(SESSION_DIR, { recursive: true });
      fs.writeFileSync(`${SESSION_DIR}/cookies.json`, JSON.stringify(cookies, null, 2));
      console.log('[airbnb] Session cookies saved');
    } catch (err) {
      console.warn('[airbnb] Failed to save cookies:', err);
    }
  }

  /**
   * Persists cookies and closes page, context and browser.
   * The browser is closed and all references are cleared even if closing the
   * page or context throws; the first error is still propagated.
   */
  async closeBrowser(): Promise<void> {
    await this.saveCookies();
    try {
      if (this.page && !this.page.isClosed()) await this.page.close();
      if (this.context) await this.context.close();
    } finally {
      try {
        if (this.browser) await this.browser.close();
      } finally {
        this.page = null;
        this.context = null;
        this.browser = null;
      }
    }
  }

  /**
   * Returns a page whose Airbnb session has just been verified.
   * @throws Error when `checkSessionFlow` reports that the session is not valid.
   */
  private async requireSessionPage(): Promise<Page> {
    const page = await this.ensureBrowser();
    const valid = await checkSessionFlow(page);
    if (!valid) {
      throw new Error('Airbnb session is not valid. Please log in first.');
    }
    return page;
  }

  // ── Adapter Interface ──────────────────────────────────────────────────

  /** Runs the login flow with the given credentials and saves the resulting cookies. */
  async login(credentials: { email: string; password: string }): Promise<void> {
    const page = await this.ensureBrowser();
    await loginFlow(page, credentials.email, credentials.password);
    await this.saveCookies();
  }

  /** Returns true when the session check passes; any error is reported as `false`. */
  async isSessionValid(): Promise<boolean> {
    try {
      const page = await this.ensureBrowser();
      return await checkSessionFlow(page);
    } catch {
      return false;
    }
  }

  /** Scrapes performance metrics; throws if the session is not valid. */
  async scrapePerformanceMetrics(): Promise<any> {
    const page = await this.requireSessionPage();
    return await scrapePerformanceFlow(page);
  }

  /** Scrapes reservations; throws if the session is not valid. */
  async scrapeReservations(): Promise<any[]> {
    const page = await this.requireSessionPage();
    return await scrapeReservationsFlow(page);
  }

  /** Scrapes calendar prices for `dateRange`; throws if the session is not valid. */
  async scrapePricing(dateRange: { from: string; to: string }): Promise<any[]> {
    const page = await this.requireSessionPage();
    return await scrapePricingFlow(page, dateRange);
  }

  /**
   * Computes per-date diffs for the proposed changes without touching the browser.
   * A missing `currentPrice` is treated as 0, and `diffPercent` is 0 whenever
   * the current price is not positive.
   */
  async previewPriceChanges(changes: any[]): Promise<any> {
    return {
      platformId: this.platformId,
      previewedAt: new Date().toISOString(),
      changesCount: changes.length,
      changes: changes.map((c) => {
        const current = c.currentPrice ?? 0;
        const diff = c.newPrice - current;
        return {
          date: c.date,
          currentPrice: current,
          proposedPrice: c.newPrice,
          diff,
          diffPercent: current > 0 ? Number(((diff / current) * 100).toFixed(1)) : 0,
        };
      }),
    };
  }

  /**
   * Applies the price changes through the calendar flow.
   * `success` is true only when every requested change was applied.
   */
  async applyPriceChanges(changes: any[]): Promise<any> {
    const page = await this.requireSessionPage();

    const results = await applyPriceChangesFlow(page, changes);
    const appliedCount = results.filter((r) => r.applied).length;

    return {
      platformId: this.platformId,
      appliedAt: new Date().toISOString(),
      success: appliedCount === changes.length,
      appliedCount,
      results,
    };
  }

  /**
   * Health probe: healthy only when a browser can be obtained and the session
   * check passes. Never throws; errors are reported in `message`.
   */
  async selfTest(): Promise<{
    platformId: string;
    healthy: boolean;
    message: string;
    checkedAt: string;
  }> {
    try {
      const page = await this.ensureBrowser();
      const sessionValid = await checkSessionFlow(page);

      return {
        platformId: this.platformId,
        healthy: sessionValid,
        message: sessionValid
          ? 'Airbnb adapter operational — session active'
          : 'Airbnb adapter operational but session expired — login required',
        checkedAt: new Date().toISOString(),
      };
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      return {
        platformId: this.platformId,
        healthy: false,
        message: `Airbnb adapter error: ${reason}`,
        checkedAt: new Date().toISOString(),
      };
    }
  }
}
|
||||
663
apps/scraper/src/adapters/airbnb/airbnb.flows.ts
Normal file
663
apps/scraper/src/adapters/airbnb/airbnb.flows.ts
Normal file
@@ -0,0 +1,663 @@
|
||||
import type { Page } from 'playwright';
|
||||
import { URLS, SELECTORS } from './airbnb.selectors.js';
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Waits for the page to settle after a navigation.
 *
 * Prefers the `networkidle` load state; because that state can be flaky, a
 * timeout falls back to `domcontentloaded`. Never throws: if both waits fail
 * the caller simply continues.
 *
 * @param page      Playwright page to wait on.
 * @param timeoutMs Upper bound, in milliseconds, for each wait.
 */
async function waitForNavigation(page: Page, timeoutMs = 15000): Promise<void> {
  try {
    await page.waitForLoadState('networkidle', { timeout: timeoutMs });
  } catch {
    // networkidle can be flaky; fall back to domcontentloaded
    await page.waitForLoadState('domcontentloaded', { timeout: timeoutMs }).catch(() => undefined);
  }
}
|
||||
|
||||
/**
 * Extracts a numeric amount from display text such as "$1,234.56" or "-$50".
 *
 * Everything except digits, dots and hyphens is discarded. If what remains is
 * not a single valid number (for example "$275 - $300" leaves "275-300"), the
 * first numeric token is used instead of collapsing the whole value to 0.
 *
 * @returns The parsed amount, or 0 when the text contains no number.
 */
function parseCurrency(text: string): number {
  const stripped = text.replace(/[^0-9.\-]/g, '');

  const whole = Number(stripped);
  if (stripped !== '' && Number.isFinite(whole)) {
    return whole;
  }

  const firstToken = stripped.match(/-?\d+(?:\.\d+)?|-?\.\d+/);
  return firstToken ? Number(firstToken[0]) : 0;
}
|
||||
|
||||
/**
 * Formats a parsed date as YYYY-MM-DD in the same frame it was parsed in.
 *
 * Strings that start with an ISO date keep the UTC-based formatting; every
 * other format is parsed by `Date` as local time, so the local calendar
 * fields are used. Formatting a local-midnight date via `toISOString()` would
 * shift it to the previous day in time zones east of UTC.
 */
function formatCalendarDate(date: Date, source: string): string {
  if (/^\s*\d{4}-\d{2}-\d{2}/.test(source)) {
    return date.toISOString().split('T')[0] ?? '';
  }
  const year = String(date.getFullYear()).padStart(4, '0');
  const month = String(date.getMonth() + 1).padStart(2, '0');
  const day = String(date.getDate()).padStart(2, '0');
  return `${year}-${month}-${day}`;
}

/**
 * Normalises an Airbnb date string to YYYY-MM-DD.
 *
 * Handled formats:
 *   "May 21, 2026"           — standard
 *   "Mar 2, 20269:09 PM ET"  — time glued to the year
 *   "Mar 2, 2026 9:09 PM ET" — time separated by a space
 *
 * @returns The ISO calendar date, or the input unchanged when it cannot be parsed.
 */
function parseDate(text: string): string {
  // Strip the time portion (everything after the 4-digit year).
  const cleaned = text
    .replace(/(\d{4})\d{1,2}:\d{2}.*/, '$1')
    .replace(/(\d{4})\s+\d{1,2}:\d{2}.*/, '$1')
    .trim();

  // Try the cleaned text first, then the original text as a last attempt.
  for (const candidate of [cleaned, text]) {
    const parsed = new Date(candidate);
    if (!isNaN(parsed.getTime())) {
      return formatCalendarDate(parsed, candidate);
    }
  }

  return text;
}
|
||||
|
||||
/**
 * Number of nights between two date strings, never less than 1.
 *
 * If either date cannot be parsed the difference is NaN; in that case the
 * minimum of 1 is returned instead of letting NaN leak into stored data.
 */
function computeNights(checkIn: string, checkOut: string): number {
  const MS_PER_DAY = 86400000;
  const start = new Date(checkIn).getTime();
  const end = new Date(checkOut).getTime();
  const nights = Math.round((end - start) / MS_PER_DAY);
  return Number.isFinite(nights) ? Math.max(1, nights) : 1;
}
|
||||
|
||||
// ── Login Flow ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Logs in to Airbnb through the web login form.
 *
 * Steps: open the login page, switch to the email form if a chooser is shown,
 * submit the email, submit the password when a password field appears, then
 * wait for an MFA code or manual completion (up to 3 minutes each) until the
 * URL looks like a logged-in page.
 *
 * @throws Error when no email input can be found, or when MFA / login is not
 *         completed within the 3-minute window.
 */
export async function loginFlow(
  page: Page,
  email: string,
  password: string,
): Promise<void> {
  // locator.isVisible() returns immediately and ignores its timeout option,
  // so waiting for an element to show up has to go through waitFor().
  const appears = (target: ReturnType<Page['locator']>, timeoutMs: number): Promise<boolean> =>
    target.waitFor({ state: 'visible', timeout: timeoutMs }).then(
      () => true,
      () => false,
    );

  console.log('[airbnb] Navigating to login page...');
  await page.goto('https://www.airbnb.com/login', { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(3000);

  // Log where we landed to help with debugging.
  console.log('[airbnb] Login page loaded, URL:', page.url());

  // Click "Continue with email" if present (Airbnb sometimes shows social login first)
  for (const text of ['Continue with email', 'Email', 'Use email']) {
    const btn = page.locator(`button:has-text("${text}")`).first();
    if (await appears(btn, 2000)) {
      console.log(`[airbnb] Clicking "${text}" button`);
      await btn.click();
      await page.waitForTimeout(2000);
      break;
    }
  }

  // Look for the email input with a broad set of selectors
  const emailInput = page
    .locator(
      'input[type="email"], input[name="email"], input[autocomplete="email"], input[autocomplete="username"], input[data-testid*="email"]',
    )
    .first();

  if (await appears(emailInput, 5000)) {
    console.log('[airbnb] Found email input, filling...');
    await emailInput.fill(email);
  } else {
    // No dedicated email input: fall back to the first visible text input
    console.log('[airbnb] Email input not found with standard selectors, trying text input...');
    const textInput = page.locator('input[type="text"]').first();
    if (await appears(textInput, 3000)) {
      await textInput.fill(email);
    } else {
      // Last resort: log page content for debugging
      const bodyText = await page.locator('body').textContent().catch(() => '');
      console.log('[airbnb] Page text preview:', bodyText?.substring(0, 500));
      throw new Error('Could not find email input on login page. The page structure may have changed.');
    }
  }

  // Click Continue / Next / Submit after email
  for (const selector of [
    'button:has-text("Continue")',
    'button:has-text("Next")',
    'button[type="submit"]',
  ]) {
    const btn = page.locator(selector).first();
    if (await appears(btn, 2000)) {
      console.log(`[airbnb] Clicking: ${selector}`);
      await btn.click();
      await page.waitForTimeout(3000);
      break;
    }
  }

  // Enter password
  const passwordInput = page
    .locator('input[type="password"], input[name="password"], input[autocomplete="current-password"]')
    .first();

  if (await appears(passwordInput, 10000)) {
    console.log('[airbnb] Found password input, filling...');
    await passwordInput.fill(password);

    // Submit the password form
    for (const selector of [
      'button:has-text("Log in")',
      'button:has-text("Login")',
      'button:has-text("Continue")',
      'button[type="submit"]',
    ]) {
      const btn = page.locator(selector).first();
      if (await appears(btn, 2000)) {
        console.log(`[airbnb] Submitting with: ${selector}`);
        await btn.click();
        break;
      }
    }
  } else {
    console.log('[airbnb] No password field — may be a passwordless flow or MFA-only');
  }

  // Wait for either MFA prompt or successful redirect
  await page.waitForTimeout(5000);
  console.log('[airbnb] Post-submit URL:', page.url());

  // Check for MFA
  const mfaInput = page
    .locator('input[inputmode="numeric"], input[autocomplete="one-time-code"], input[name*="code"]')
    .first();
  if (await appears(mfaInput, 5000)) {
    console.log('[airbnb] MFA required. Waiting for manual code entry (up to 3 minutes)...');
    await page.waitForURL(/\/(hosting|dashboard|account|users)/, { timeout: 180000 }).catch(() => {
      throw new Error('MFA timeout: code was not entered within 3 minutes');
    });
  }

  // Check if we're on a logged-in page
  const currentUrl = page.url();
  if (currentUrl.includes('/login') || currentUrl.includes('/signup')) {
    // Still on login page — maybe waiting for user action in non-headless mode
    console.log('[airbnb] Still on login page. Waiting for user to complete login (up to 3 minutes)...');
    await page.waitForURL(/\/(hosting|dashboard|account|users)/, { timeout: 180000 }).catch(() => {
      throw new Error('Login timeout: did not reach a logged-in page within 3 minutes');
    });
  }

  console.log('[airbnb] Login successful, URL:', page.url());
}
|
||||
|
||||
// ── Session Check ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Checks whether the browser session is logged in as a host.
 *
 * Navigates to the host home page and waits up to 5 seconds for the primary
 * host navigation to become visible. `waitFor` is used because
 * `locator.isVisible()` ignores its timeout and returns immediately.
 *
 * @returns true when the host navigation is visible; false on timeout or any error.
 */
export async function checkSessionFlow(page: Page): Promise<boolean> {
  try {
    await page.goto(URLS.HOST_HOME, { waitUntil: 'domcontentloaded', timeout: 15000 });
    await page.waitForTimeout(2000);

    // If we see the host nav, we're logged in
    await page.locator(SELECTORS.NAV_PRIMARY).waitFor({ state: 'visible', timeout: 5000 });
    return true;
  } catch {
    return false;
  }
}
|
||||
|
||||
// ── Discover Listing ID ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Finds the numeric listing ID by inspecting links on the listings page.
 *
 * Lookup order:
 *   1. the first visible link matching each known listing URL pattern,
 *   2. any link on the page whose path contains a number of 5+ digits,
 *   3. a calendar link containing such a number,
 *   4. the AIRBNB_LISTING_ID environment variable.
 *
 * @throws Error when none of the above yields an ID.
 */
export async function discoverListingId(page: Page): Promise<string> {
  // locator.isVisible() ignores its timeout option; waitFor() actually waits.
  const appears = (target: ReturnType<Page['locator']>, timeoutMs: number): Promise<boolean> =>
    target.waitFor({ state: 'visible', timeout: timeoutMs }).then(
      () => true,
      () => false,
    );

  await page.goto(URLS.LISTINGS, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(3000);

  // Try multiple selector patterns for finding the listing ID
  const selectors = [
    'a[href*="/hosting/listings/editor/"]',
    'a[href*="/multicalendar/"]',
    'a[href*="/hosting/listings/"]',
    'a[href*="/rooms/"]',
  ];

  for (const selector of selectors) {
    const link = page.locator(selector).first();
    if (!(await appears(link, 3000))) continue;

    const href = await link.getAttribute('href');
    if (!href) continue;

    // Extract numeric ID from various URL patterns
    const match = href.match(/\/(?:editor|multicalendar|rooms|listings)\/(\d+)/);
    if (match) {
      console.log(`[airbnb] Discovered listing ID: ${match[1]} (from ${selector})`);
      return match[1];
    }
  }

  // Fallback: scan all links on the page for any numeric ID pattern
  const allHrefs = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('a[href]'))
      .map((a) => a.getAttribute('href') || '')
      .filter((h) => /\/\d{5,}/.test(h));
  });

  for (const href of allHrefs) {
    const match = href.match(/\/(\d{5,})/);
    if (match) {
      console.log(`[airbnb] Discovered listing ID: ${match[1]} (from page scan: ${href})`);
      return match[1];
    }
  }

  // Last resort on the page: check the calendar URL pattern from nav
  const calendarLink = page.locator('a[href*="/calendar"]').first();
  if (await appears(calendarLink, 2000)) {
    const href = await calendarLink.getAttribute('href');
    const match = href?.match(/\/(\d{5,})/);
    if (match) {
      console.log(`[airbnb] Discovered listing ID: ${match[1]} (from calendar nav)`);
      return match[1];
    }
  }

  // Operator-supplied fallback from the environment
  const fallbackId = process.env.AIRBNB_LISTING_ID;
  if (fallbackId) {
    console.log(`[airbnb] Using fallback listing ID from env: ${fallbackId}`);
    return fallbackId;
  }

  throw new Error('Could not find listing ID from listings page. Set AIRBNB_LISTING_ID env var as fallback.');
}
|
||||
|
||||
// ── Scrape Performance Metrics ───────────────────────────────────────────────
|
||||
|
||||
/**
 * Scrapes earnings, views and review figures and assembles a performance snapshot.
 *
 * Pages visited: earnings/performance, insights/views (only when a listing ID
 * could be discovered) and insights/reviews. Values are extracted from the
 * text content of each page's `<main>` element with regular expressions; any
 * figure that cannot be found stays at 0.
 *
 * Counts may contain thousands separators ("1,234 Views"), so digits and
 * commas are captured together and the commas removed before parsing.
 *
 * @returns A plain object with the snapshot fields plus a `rawJson` payload.
 */
export async function scrapePerformanceFlow(page: Page): Promise<any> {
  const toInt = (digits: string): number => parseInt(digits.replace(/,/g, ''), 10);

  // First get listing ID for views page; without it the views section is skipped.
  let listingId: string;
  try {
    listingId = await discoverListingId(page);
  } catch (err: unknown) {
    console.warn('[airbnb] Listing ID not found; skipping views metrics:', err);
    listingId = '';
  }

  // ── Scrape Earnings / Performance ────────────────────────────────────
  await page.goto(URLS.EARNINGS_PERFORMANCE, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(3000);

  const earningsText = (await page.locator('main').textContent()) ?? '';

  let revenueTotal = 0;
  let nightsBooked = 0;
  let avgNightStay = 0;

  // Parse "Total (USD)" amount
  const totalMatch = earningsText.match(/Total \(USD\)\s*\$?([\d,]+\.?\d*)/);
  if (totalMatch) {
    revenueTotal = parseCurrency(totalMatch[1]);
  } else {
    // Fallback: the currency amount that directly precedes "Paid"
    const paidMatch = earningsText.match(/\$([\d,]+\.?\d*)\s*Paid/);
    if (paidMatch) revenueTotal = parseCurrency(paidMatch[1]);
  }

  // Expand Performance stats if collapsed. waitFor is used because
  // locator.isVisible() ignores its timeout and returns immediately.
  const perfStatsBtn = page.locator('button:has-text("Performance stats")');
  const perfStatsShown = await perfStatsBtn.waitFor({ state: 'visible', timeout: 2000 }).then(
    () => true,
    () => false,
  );
  if (perfStatsShown) {
    await perfStatsBtn.click();
    await page.waitForTimeout(500);
  }

  const updatedText = (await page.locator('main').textContent()) ?? '';
  const nightsMatch = updatedText.match(/(\d[\d,]*)\s*Nights? booked/i);
  if (nightsMatch) nightsBooked = toInt(nightsMatch[1]);
  // The average stay may be fractional; an integer-only pattern would keep just the trailing digits.
  const avgStayMatch = updatedText.match(/(\d+(?:\.\d+)?)\s*Avg night stay/i);
  if (avgStayMatch) avgNightStay = parseFloat(avgStayMatch[1]);

  // ── Scrape Insights / Views ──────────────────────────────────────────
  let viewsSearch = 0;
  let newBookings = 0;
  let bookingRate = 0;

  if (listingId) {
    await page.goto(URLS.INSIGHTS_VIEWS(listingId), { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(3000);

    const viewsPageText = (await page.locator('main').textContent()) ?? '';

    // "161" Views, past 30 days
    const viewsMatch = viewsPageText.match(/(\d[\d,]*)\s*Views,?\s*past 30 days/i);
    if (viewsMatch) viewsSearch = toInt(viewsMatch[1]);

    // "2" New bookings, past 30 days
    const bookingsMatch = viewsPageText.match(/(\d[\d,]*)\s*New bookings,?\s*past 30 days/i);
    if (bookingsMatch) newBookings = toInt(bookingsMatch[1]);

    // "1.2%" Booking rate
    const rateMatch = viewsPageText.match(/([\d.]+)%\s*Booking rate/i);
    if (rateMatch) bookingRate = parseFloat(rateMatch[1]);
  }

  // ── Scrape Insights / Reviews ────────────────────────────────────────
  let overallRating = 0;
  let reviewCount = 0;

  await page.goto(URLS.INSIGHTS_REVIEWS, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(3000);

  const reviewsText = (await page.locator('main').textContent()) ?? '';

  const ratingMatch = reviewsText.match(/([\d.]+)\s*overall rating/i);
  if (ratingMatch) overallRating = parseFloat(ratingMatch[1]);

  const reviewCountMatch = reviewsText.match(/(\d[\d,]*)\s*reviews/i);
  if (reviewCountMatch) reviewCount = toInt(reviewCountMatch[1]);

  // ── Compute derived metrics ──────────────────────────────────────────
  const avgDailyRate = nightsBooked > 0 ? revenueTotal / nightsBooked : 0;
  // Occupancy: nights booked / 30 days * 100
  const occupancyRate = (nightsBooked / 30) * 100;

  return {
    platformId: 'airbnb',
    capturedAt: new Date().toISOString(),
    periodLabel: 'last_30_days',
    viewsSearch,
    viewsListing: viewsSearch, // Airbnb doesn't split search vs listing views
    conversionRate: bookingRate,
    bookingsCount: newBookings || nightsBooked,
    occupancyRate: Number(occupancyRate.toFixed(1)),
    avgDailyRate: Number(avgDailyRate.toFixed(2)),
    revenueTotal: Number(revenueTotal.toFixed(2)),
    rawJson: {
      source: 'airbnb',
      scrapedAt: new Date().toISOString(),
      listingId,
      nightsBooked,
      avgNightStay,
      overallRating,
      reviewCount,
      bookingRate,
    },
  };
}
|
||||
|
||||
// ── Scrape Reservations ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Scrapes the completed and upcoming reservation tables.
 *
 * Each table row with at least 8 cells becomes one reservation record. Rows
 * that fail to parse are logged and skipped. Records are de-duplicated by
 * `platformReservationId` and returned sorted by check-in date, with
 * unparseable check-in dates placed last.
 *
 * @returns Array of plain reservation objects.
 */
export async function scrapeReservationsFlow(page: Page): Promise<any[]> {
  const reservations: any[] = [];

  // Scrape both completed and upcoming
  for (const url of [URLS.RESERVATIONS_COMPLETED, URLS.RESERVATIONS]) {
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(3000);

    // Wait for a table to appear. waitFor() is used because isVisible()
    // ignores its timeout; first() avoids a strict-mode error (and a silently
    // skipped page) when the page renders more than one table.
    const tableShown = await page
      .locator('table')
      .first()
      .waitFor({ state: 'visible', timeout: 5000 })
      .then(
        () => true,
        () => false,
      );
    if (!tableShown) {
      continue;
    }

    // Get all rows that contain data cells
    const rows = page.locator('table tbody tr, table tr').filter({
      has: page.locator('td'),
    });
    const rowCount = await rows.count();

    for (let i = 0; i < rowCount; i++) {
      try {
        const row = rows.nth(i);
        const cells = row.locator('td');
        const cellCount = await cells.count();
        if (cellCount < 8) continue;

        // Actual Airbnb columns (10 cols):
        // [0]=Status [1]=Guests [2]=Contact [3]=Check-in [4]=Checkout
        // [5]=Booked [6]=Listing [7]=Confirmation Code [8]=Total Payout [9]=Actions
        const status = (await cells.nth(0).textContent())?.trim().toLowerCase() ?? '';
        const guestText = (await cells.nth(1).textContent())?.trim() ?? '';
        const checkInText = (await cells.nth(3).textContent())?.trim() ?? '';
        const checkOutText = (await cells.nth(4).textContent())?.trim() ?? '';
        const bookedText = (await cells.nth(5).textContent())?.trim() ?? '';
        const confirmationCode = (await cells.nth(7).textContent())?.trim() ?? '';
        const payoutText = cellCount > 8 ? (await cells.nth(8).textContent())?.trim() ?? '' : '';

        // Parse guest name from profile link (clean text) or fall back to cell text
        let guestName = 'Unknown';
        const profileLink = cells.nth(1).locator('a[href*="/users/profile/"], a[href*="/users/show/"]').first();
        if ((await profileLink.count()) > 0) {
          guestName = (await profileLink.textContent())?.trim() || 'Unknown';
        }
        if (guestName === 'Unknown' || /\d+\s*(adult|guest)/i.test(guestName)) {
          // Fallback: split merged "NameNadults" text — e.g., "Cassie Graham7 adults"
          const nameMatch = guestText.match(/^(.+?)(\d+\s*(?:adult|guest|infant|child|pet))/i);
          guestName = nameMatch ? nameMatch[1].trim() : guestText.split('\n')[0]?.trim() || 'Unknown';
        }
        const guestsCountMatch = guestText.match(/(\d+)\s*(adult|guest)/i);
        const guestsCount = guestsCountMatch ? parseInt(guestsCountMatch[1], 10) : 1;

        const checkIn = parseDate(checkInText);
        const checkOut = parseDate(checkOutText);
        // Guard against NaN when either date could not be parsed.
        const rawNights = computeNights(checkIn, checkOut);
        const nights = Number.isFinite(rawNights) ? rawNights : 1;
        const totalPayout = parseCurrency(payoutText);

        // Map status
        let mappedStatus: string;
        if (status.includes('past guest') || status.includes('completed')) {
          mappedStatus = 'completed';
        } else if (status.includes('confirmed') || status.includes('upcoming')) {
          mappedStatus = 'confirmed';
        } else if (status.includes('cancel')) {
          mappedStatus = 'cancelled';
        } else if (status.includes('check')) {
          mappedStatus = 'checked_in';
        } else {
          mappedStatus = status || 'unknown';
        }

        // Estimate nightly rate from total payout (total / nights is rough)
        const nightlyRate = nights > 0 ? Number((totalPayout / nights).toFixed(2)) : 0;

        // Safely parse bookedAt — fallback to now if invalid
        let bookedAt: string;
        try {
          const parsed = new Date(parseDate(bookedText));
          bookedAt = isNaN(parsed.getTime()) ? new Date().toISOString() : parsed.toISOString();
        } catch {
          bookedAt = new Date().toISOString();
        }

        reservations.push({
          platformId: 'airbnb',
          platformReservationId: confirmationCode || `ABB-${Date.now()}-${i}`,
          guestName,
          checkIn,
          checkOut,
          nights,
          guestsCount,
          nightlyRate,
          cleaningFee: 0, // Not available in table view; could be scraped from detail
          platformFee: 0, // Not available in table view
          totalPayout,
          status: mappedStatus,
          bookedAt,
          rawJson: {
            source: 'airbnb',
            confirmationCode,
            scrapedAt: new Date().toISOString(),
          },
        });
      } catch (err) {
        console.warn(`[airbnb] Failed to parse reservation row ${i}:`, err);
      }
    }
  }

  // Deduplicate by confirmation code
  const seen = new Set<string>();
  const unique = reservations.filter((r) => {
    if (seen.has(r.platformReservationId)) return false;
    seen.add(r.platformReservationId);
    return true;
  });

  console.log(`[airbnb] Scraped ${unique.length} reservations`);

  // A comparator must never return NaN; unparseable check-in dates sort last.
  const checkInTime = (r: any): number => {
    const t = new Date(r.checkIn).getTime();
    return Number.isNaN(t) ? Number.POSITIVE_INFINITY : t;
  };
  return unique.sort((a, b) => {
    const ta = checkInTime(a);
    const tb = checkInTime(b);
    if (ta === tb) return 0;
    return ta < tb ? -1 : 1;
  });
}
|
||||
|
||||
// ── Scrape Pricing from Calendar ─────────────────────────────────────────────
|
||||
|
||||
/**
 * Scrapes nightly prices from the listing calendar for a date range.
 *
 * Airbnb's multicalendar uses a virtualized scroll list (virtuoso). Day text
 * follows the pattern "Wednesday 4 Mar4Nightly price$275", with "Unavailable"
 * for blocked days and no price text for reserved days. The calendar's text
 * content is parsed repeatedly while scrolling until the end of the range is
 * reached, scrolling is no longer possible, or 20 scroll attempts were made.
 *
 * @param dateRange Inclusive range as YYYY-MM-DD strings (compared lexicographically).
 * @returns One entry per date within the range, de-duplicated and sorted by date.
 */
export async function scrapePricingFlow(
  page: Page,
  dateRange: { from: string; to: string },
): Promise<any[]> {
  const listingId = await discoverListingId(page);
  await page.goto(URLS.CALENDAR(listingId), { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(5000);

  const prices: any[] = [];
  const targetEnd = new Date(dateRange.to);
  const maxScrolls = 20;
  let scrollAttempts = 0;

  // The calendar text carries no year, so it is inferred from today's date.
  const currentYear = new Date().getFullYear();

  while (scrollAttempts < maxScrolls) {
    // NOTE: this callback runs in the browser page, so it must stay self-contained.
    const dayData = await page.evaluate((year: number) => {
      const results: { dateText: string; price: number; available: boolean }[] = [];
      const mainEl = document.querySelector('[data-testid="listing-calendar"]') || document.querySelector('main');
      if (!mainEl) return results;

      const text = mainEl.textContent || '';

      // Match day entries like "Sunday 1 Feb1Nightly price$275" or "Monday 20 Mar20UnavailableNightly price$275"
      // The pattern is: DayName DD MonDD[Unavailable][Nightly price$NNN]
      const months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
      const dayNames = '(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday)';
      const monthAbbr = '(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';

      // The price group accepts thousands separators ("$1,275"); a digits-only
      // group would stop at the comma and report a price of 1.
      const dayRegex = new RegExp(
        `(?:Today, )?${dayNames}\\s+(\\d{1,2})\\s+(${monthAbbr})\\1(Unavailable)?(?:Nightly price\\$(\\d[\\d,]*))?`,
        'g',
      );

      let match: RegExpExecArray | null;
      while ((match = dayRegex.exec(text)) !== null) {
        const day = parseInt(match[1], 10);
        const monthAbbreviation = match[2];
        const isUnavailable = !!match[3];
        const price = match[4] ? parseInt(match[4].replace(/,/g, ''), 10) : 0;

        const monthIndex = months.indexOf(monthAbbreviation);
        if (monthIndex === -1) continue;

        // Determine the year — a month more than one month behind the current
        // month is taken to belong to next year.
        const currentMonth = new Date().getMonth();
        let dateYear = year;
        if (monthIndex < currentMonth - 1) {
          dateYear = year + 1;
        }

        const dateStr = `${dateYear}-${String(monthIndex + 1).padStart(2, '0')}-${String(day).padStart(2, '0')}`;
        results.push({
          dateText: dateStr,
          price,
          available: !isUnavailable && price > 0,
        });
      }

      return results;
    }, currentYear);

    for (const d of dayData) {
      if (d.dateText >= dateRange.from && d.dateText <= dateRange.to) {
        prices.push({
          platformId: 'airbnb',
          date: d.dateText,
          price: d.price,
          isAvailable: d.available,
          minStayNights: 3,
          syncedAt: new Date().toISOString(),
        });
      }
    }

    // Check if we've reached the target end date
    const latestDate = dayData.length > 0
      ? new Date(dayData[dayData.length - 1].dateText)
      : new Date();

    if (latestDate >= targetEnd) break;

    // Scroll the virtuoso scroller down to load more months
    const scrolled = await page.evaluate(() => {
      const scroller = document.querySelector('[data-testid="virtuoso-scroller"]');
      if (scroller) {
        const prevTop = scroller.scrollTop;
        scroller.scrollTop += 800;
        return scroller.scrollTop > prevTop;
      }
      return false;
    });

    if (scrolled) {
      await page.waitForTimeout(1500);
    } else {
      // Try clicking next month button as fallback. waitFor() is used because
      // locator.isVisible() ignores its timeout and returns immediately.
      const nextBtn = page.locator('button[aria-label*="Move forward"], button[aria-label*="next month"]').first();
      const nextShown = await nextBtn.waitFor({ state: 'visible', timeout: 2000 }).then(
        () => true,
        () => false,
      );
      if (!nextShown) break;
      await nextBtn.click();
      await page.waitForTimeout(2000);
    }

    scrollAttempts++;
  }

  // Deduplicate by date (the first occurrence wins)
  const seen = new Set<string>();
  const unique = prices.filter((p) => {
    if (seen.has(p.date)) return false;
    seen.add(p.date);
    return true;
  });

  console.log(`[airbnb] Scraped ${unique.length} daily prices`);
  return unique.sort((a, b) => a.date.localeCompare(b.date));
}
|
||||
|
||||
// ── Apply Price Changes via Calendar ─────────────────────────────────────────
|
||||
|
||||
/**
 * Applies nightly price changes one date at a time through the Airbnb calendar UI.
 *
 * For each change the calendar page is reloaded, the matching day cell is
 * clicked, the price input is filled and the Save button is pressed. A failure
 * on one date is recorded in that date's result and never aborts the batch.
 *
 * @param page - Playwright page used to drive the calendar.
 * @param changes - Requested prices; `date` is expected to be a `YYYY-MM-DD` string.
 * @returns One result per requested change, in input order. `applied` is
 *   `true` only when the Save button was found and clicked; otherwise `error`
 *   explains what went wrong.
 */
export async function applyPriceChangesFlow(
  page: Page,
  changes: Array<{ date: string; newPrice: number }>,
): Promise<Array<{ date: string; newPrice: number; applied: boolean; error?: string }>> {
  // `locator.isVisible()` returns immediately and ignores its `timeout` option,
  // so wait explicitly for visibility and map a timeout to `false`.
  const becomesVisible = (
    target: ReturnType<Page['locator']>,
    timeout: number,
  ): Promise<boolean> =>
    target
      .waitFor({ state: 'visible', timeout })
      .then(() => true)
      .catch(() => false);

  const listingId = await discoverListingId(page);
  const results: Array<{ date: string; newPrice: number; applied: boolean; error?: string }> = [];

  for (const change of changes) {
    try {
      // Reject unusable input before touching the page so a bad entry can
      // never be typed into the live calendar.
      const dateObj = new Date(change.date);
      if (Number.isNaN(dateObj.getTime())) {
        results.push({ ...change, applied: false, error: `Invalid date: ${change.date}` });
        continue;
      }
      if (!Number.isFinite(change.newPrice) || change.newPrice <= 0) {
        results.push({ ...change, applied: false, error: `Invalid price: ${change.newPrice}` });
        continue;
      }

      // Navigate to the calendar
      await page.goto(URLS.CALENDAR(listingId), { waitUntil: 'domcontentloaded' });
      await page.waitForTimeout(2000);

      // Date-only ISO strings are parsed as UTC midnight, so the label and the
      // day number must also be read in UTC; formatting in local time would
      // select the previous day in time zones west of UTC.
      const label = dateObj.toLocaleDateString('en-US', {
        month: 'long',
        day: 'numeric',
        year: 'numeric',
        timeZone: 'UTC',
      });
      const dayOfMonth = dateObj.getUTCDate();

      // NOTE(review): the `:has-text()` fallback matches any cell whose text
      // contains the day number (e.g. "1" also matches "21") in any visible
      // month — confirm against the live DOM before relying on it.
      const dayCell = page
        .locator(`td[aria-label*="${label}"], td:has-text("${dayOfMonth}")`)
        .first();
      if (!(await becomesVisible(dayCell, 3000))) {
        results.push({ ...change, applied: false, error: 'Date cell not found' });
        continue;
      }

      await dayCell.click();
      await page.waitForTimeout(1000);

      // Look for price input in the sidebar
      const priceInput = page.locator('input[aria-label*="price"], input[name*="price"]').first();
      if (!(await becomesVisible(priceInput, 3000))) {
        results.push({ ...change, applied: false, error: 'Price input not found' });
        continue;
      }

      await priceInput.click({ clickCount: 3 }); // Select all
      await priceInput.fill(String(change.newPrice));

      // Save
      const saveBtn = page.locator('button:has-text("Save")').first();
      if (await becomesVisible(saveBtn, 2000)) {
        await saveBtn.click();
        await page.waitForTimeout(2000);
        results.push({ ...change, applied: true });
      } else {
        results.push({ ...change, applied: false, error: 'Save button not found' });
      }
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      results.push({ ...change, applied: false, error: message });
    }
  }

  return results;
}
|
||||
86
apps/scraper/src/adapters/airbnb/airbnb.selectors.ts
Normal file
86
apps/scraper/src/adapters/airbnb/airbnb.selectors.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
// Airbnb Host Dashboard selectors mapped from live DOM exploration (March 2026)
|
||||
// These target the accessibility tree structure rather than fragile CSS classes.
|
||||
|
||||
/**
 * Airbnb host-site URLs used by the scraper flows.
 *
 * Entries that take an identifier percent-encode it, so an identifier
 * containing `/`, `?` or `#` cannot change the path being requested.
 */
export const URLS = {
  HOST_HOME: 'https://www.airbnb.com/hosting',
  CALENDAR: (listingId: string) =>
    `https://www.airbnb.com/multicalendar/${encodeURIComponent(listingId)}`,
  LISTINGS: 'https://www.airbnb.com/hosting/listings',
  LISTING_EDITOR: (listingId: string) =>
    `https://www.airbnb.com/hosting/listings/editor/${encodeURIComponent(listingId)}/details/photo-tour`,
  RESERVATIONS: 'https://www.airbnb.com/hosting/reservations',
  RESERVATIONS_COMPLETED: 'https://www.airbnb.com/hosting/reservations/completed',
  RESERVATIONS_ALL: 'https://www.airbnb.com/hosting/reservations/all',
  RESERVATION_DETAIL: (confirmationCode: string) =>
    `https://www.airbnb.com/hosting/reservations/details/${encodeURIComponent(confirmationCode)}`,
  EARNINGS_PERFORMANCE: 'https://www.airbnb.com/users/transaction_history',
  INSIGHTS_REVIEWS: 'https://www.airbnb.com/progress/reviews',
  INSIGHTS_VIEWS: (listingId: string) =>
    `https://www.airbnb.com/progress/views/${encodeURIComponent(listingId)}`,
} as const;
|
||||
|
||||
/**
 * Playwright selector strings for the Airbnb host dashboard, grouped by page.
 *
 * Tab selectors use `[role="tab"]`: a bare `tab` is a CSS type selector for a
 * `<tab>` element, which HTML does not define, so it could never match.
 */
export const SELECTORS = {
  // ── Navigation ───────────────────────────────────────────────────────
  NAV_PRIMARY: 'nav[aria-label="Primary"]',
  NAV_TODAY: 'a[href="/hosting"]',
  NAV_CALENDAR: 'a[href="/calendar-router"]',
  NAV_LISTINGS: 'a[href="/hosting/listings"]',
  NAV_MESSAGES: 'a[href="/hosting/messages"]',
  NAV_MENU_BUTTON: 'button[aria-label="Main navigation menu"]',

  // ── Login ────────────────────────────────────────────────────────────
  LOGIN_EMAIL: 'input[type="email"], input[name="email"]',
  LOGIN_PASSWORD: 'input[type="password"], input[name="password"]',
  LOGIN_SUBMIT: 'button[type="submit"]',
  LOGIN_CONTINUE: 'button:has-text("Continue")',
  MFA_INPUT: 'input[inputmode="numeric"]',

  // ── Reservations Page ────────────────────────────────────────────────
  RESERVATIONS_TAB_UPCOMING: '[role="tab"]:has-text("Upcoming"), button:has-text("Upcoming")',
  RESERVATIONS_TAB_COMPLETED: '[role="tab"]:has-text("Completed"), button:has-text("Completed")',
  RESERVATIONS_TAB_CANCELLED: '[role="tab"]:has-text("Cancelled"), button:has-text("Cancelled")',
  // NOTE(review): `:has-text("All")` is a case-insensitive substring match, so
  // it can also match unrelated buttons whose text merely contains "all".
  RESERVATIONS_TAB_ALL: '[role="tab"]:has-text("All"), button:has-text("All")',
  RESERVATIONS_TABLE: 'table',
  RESERVATIONS_TABLE_ROWS: 'table tbody tr',
  RESERVATIONS_DETAIL_BUTTON: 'button:has-text("Details"), a:has-text("Details")',
  RESERVATIONS_EXPORT_BUTTON: 'button:has-text("Export")',

  // Reservation detail modal
  RESERVATION_DETAIL_MODAL: '[role="dialog"], [aria-modal="true"]',
  RESERVATION_DETAIL_CLOSE: 'button:has-text("×"), button[aria-label="Close"]',

  // ── Earnings / Performance Page ──────────────────────────────────────
  EARNINGS_NAV_PERFORMANCE: 'text=Performance',
  EARNINGS_NAV_UPCOMING: 'text=Upcoming',
  EARNINGS_NAV_PAID: 'text=Paid',
  EARNINGS_NAV_REPORTS: 'text=Reports',
  EARNINGS_MONTH_LABEL: 'text=/^\\w+ \\d{4}$/', // e.g., "March 2026"
  EARNINGS_PAID_AMOUNT: 'text=/^\\$[\\d,]+\\.\\d{2}$/',
  EARNINGS_PERFORMANCE_STATS: 'button:has-text("Performance stats")',
  EARNINGS_PAID_BREAKDOWN: 'button:has-text("Paid breakdown")',

  // ── Insights / Views Page ────────────────────────────────────────────
  INSIGHTS_TAB_REVIEWS: '[role="tab"]:has-text("Reviews"), button:has-text("Reviews")',
  INSIGHTS_TAB_VIEWS: '[role="tab"]:has-text("Views"), button:has-text("Views")',
  INSIGHTS_TAB_OPPORTUNITIES:
    '[role="tab"]:has-text("Opportunities"), button:has-text("Opportunities")',
  INSIGHTS_TAB_SUPERHOST: '[role="tab"]:has-text("Superhost"), button:has-text("Superhost")',
  INSIGHTS_VIEWS_COUNT: 'text=/^\\d+$/', // "161"
  INSIGHTS_VIEWS_LABEL: 'text="Views, past 30 days"',
  INSIGHTS_BOOKINGS_LABEL: 'text="New bookings, past 30 days"',
  INSIGHTS_BOOKING_RATE_LABEL: 'text="Booking rate"',
  INSIGHTS_OVERALL_RATING: 'text=/★ [\\d.]+ overall rating/',

  // ── Calendar / Pricing ───────────────────────────────────────────────
  CALENDAR_DAY_CELL: 'td[data-testid]',
  CALENDAR_PRICE_DISPLAY: '[data-testid="price-item-container"]',
  CALENDAR_SIDEBAR: '[data-testid="calendar-sidebar"]',
  CALENDAR_PRICE_INPUT: 'input[aria-label*="price"], input[name*="price"]',
  CALENDAR_SAVE_BUTTON: 'button:has-text("Save")',
  CALENDAR_NEXT_MONTH: 'button[aria-label="Move forward to switch to the next month"]',
  CALENDAR_PREV_MONTH: 'button[aria-label="Move backward to switch to the previous month"]',

  // ── Listings Page ────────────────────────────────────────────────────
  LISTING_CARD: 'a[href*="/hosting/listings/editor/"]',
  LISTING_STATUS_BADGE: 'text="Listed"',
  LISTING_TITLE: 'h1, [data-testid="listing-title"]',

  // ── General ──────────────────────────────────────────────────────────
  LOADING_SPINNER: '[role="progressbar"], [aria-busy="true"]',
  PAGE_MAIN: 'main',
} as const;
|
||||
31
apps/scraper/src/adapters/base/AdapterRegistry.ts
Normal file
31
apps/scraper/src/adapters/base/AdapterRegistry.ts
Normal file
@@ -0,0 +1,31 @@
|
||||
import type { PlatformAdapterInterface } from './PlatformAdapter.js';
|
||||
|
||||
/**
 * In-memory lookup of platform adapters keyed by their `platformId`.
 */
class AdapterRegistry {
  private adapters = new Map<string, PlatformAdapterInterface>();

  /** Stores `adapter` under its own `platformId`, replacing any earlier entry for that id. */
  register(adapter: PlatformAdapterInterface): void {
    const { platformId } = adapter;
    this.adapters.set(platformId, adapter);
  }

  /** Returns the adapter stored for `platformId`, or `undefined` when there is none. */
  get(platformId: string): PlatformAdapterInterface | undefined {
    return this.adapters.get(platformId);
  }

  /**
   * Returns the adapter stored for `platformId`.
   *
   * @throws Error when no adapter is stored under that id.
   */
  getOrThrow(platformId: string): PlatformAdapterInterface {
    const found = this.adapters.get(platformId);
    if (found) {
      return found;
    }
    throw new Error(`No adapter registered for platform: ${platformId}`);
  }

  /** Lists the registered platform ids in registration order. */
  list(): string[] {
    return [...this.adapters.keys()];
  }

  /** Reports whether an adapter is stored under `platformId`. */
  has(platformId: string): boolean {
    return this.adapters.has(platformId);
  }
}

/** Registry instance shared by the modules that import it. */
export const registry = new AdapterRegistry();
|
||||
36
apps/scraper/src/adapters/base/PlatformAdapter.ts
Normal file
36
apps/scraper/src/adapters/base/PlatformAdapter.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
/** Email/password pair handed to an adapter's `login`. */
type PlatformCredentials = { email: string; password: string };

/** Inclusive bounds passed to `scrapePricing`. */
type PricingDateRange = { from: string; to: string };

/** Health report produced by an adapter's `selfTest`. */
type SelfTestReport = {
  platformId: string;
  healthy: boolean;
  message: string;
  checkedAt: string;
};

/**
 * Contract every platform adapter fulfils: identification, session handling,
 * the three scrape operations, price-change preview/apply and a self-test.
 */
export interface PlatformAdapterInterface {
  /** Key under which the adapter is registered and looked up. */
  readonly platformId: string;
  /** Human-readable platform name. */
  readonly displayName: string;

  login(credentials: PlatformCredentials): Promise<void>;
  isSessionValid(): Promise<boolean>;

  scrapePerformanceMetrics(): Promise<any>;
  scrapeReservations(): Promise<any[]>;
  scrapePricing(dateRange: PricingDateRange): Promise<any[]>;

  previewPriceChanges(changes: any[]): Promise<any>;
  applyPriceChanges(changes: any[]): Promise<any>;

  selfTest(): Promise<SelfTestReport>;
}

/**
 * Abstract base class for concrete adapters. It adds no behaviour of its own;
 * every member of {@link PlatformAdapterInterface} is declared abstract.
 */
export abstract class PlatformAdapter implements PlatformAdapterInterface {
  abstract readonly platformId: string;
  abstract readonly displayName: string;

  abstract login(credentials: PlatformCredentials): Promise<void>;
  abstract isSessionValid(): Promise<boolean>;

  abstract scrapePerformanceMetrics(): Promise<any>;
  abstract scrapeReservations(): Promise<any[]>;
  abstract scrapePricing(dateRange: PricingDateRange): Promise<any[]>;

  abstract previewPriceChanges(changes: any[]): Promise<any>;
  abstract applyPriceChanges(changes: any[]): Promise<any>;

  abstract selfTest(): Promise<SelfTestReport>;
}
|
||||
79
apps/scraper/src/adapters/mock/MockAdapter.ts
Normal file
79
apps/scraper/src/adapters/mock/MockAdapter.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
import { PlatformAdapter } from '../base/PlatformAdapter.js';
|
||||
import {
|
||||
generatePerformanceSnapshot,
|
||||
generateReservations,
|
||||
generateDailyPrices,
|
||||
} from './mock-data.js';
|
||||
|
||||
/**
 * Adapter that talks to no real platform: scrape calls return generated mock
 * data and price changes are echoed back as applied.
 */
export class MockAdapter extends PlatformAdapter {
  readonly platformId = 'mock';
  readonly displayName = 'Mock Platform';

  /** No-op; the mock platform needs no authentication. */
  async login(_credentials: { email: string; password: string }): Promise<void> {
    // No-op for mock adapter
  }

  /** Always reports a valid session. */
  async isSessionValid(): Promise<boolean> {
    return true;
  }

  /** Returns a freshly generated performance snapshot. */
  async scrapePerformanceMetrics(): Promise<any> {
    return generatePerformanceSnapshot(this.platformId);
  }

  /** Returns a freshly generated list of reservations. */
  async scrapeReservations(): Promise<any[]> {
    return generateReservations(this.platformId);
  }

  /** Returns generated daily prices for the given `from`/`to` range. */
  async scrapePricing(dateRange: { from: string; to: string }): Promise<any[]> {
    return generateDailyPrices(this.platformId, dateRange.from, dateRange.to);
  }

  /**
   * Builds a preview of the requested price changes.
   *
   * When a change carries no `currentPrice`, one baseline is drawn at random
   * and used for the reported `currentPrice`, `diff` and `diffPercent` alike,
   * so the three figures always agree with each other.
   *
   * @param changes - Items with `date`, `newPrice` and optionally `currentPrice`.
   */
  async previewPriceChanges(changes: any[]): Promise<any> {
    return {
      platformId: this.platformId,
      previewedAt: new Date().toISOString(),
      changesCount: changes.length,
      changes: changes.map((c) => {
        const currentPrice: number = c.currentPrice ?? Math.round(Math.random() * 100 + 150);
        const diff = c.newPrice - currentPrice;
        // A zero baseline has no meaningful percentage; report 0 rather than
        // Infinity/NaN, which JSON would serialise as null.
        const diffPercent = currentPrice === 0 ? 0 : Number(((diff / currentPrice) * 100).toFixed(1));

        return {
          date: c.date,
          currentPrice,
          proposedPrice: c.newPrice,
          diff,
          diffPercent,
        };
      }),
    };
  }

  /**
   * Pretends to apply the changes: waits 500 ms, then reports every change as applied.
   *
   * @param changes - Items with `date` and `newPrice`.
   */
  async applyPriceChanges(changes: any[]): Promise<any> {
    // Simulate network delay
    await new Promise((resolve) => setTimeout(resolve, 500));

    return {
      platformId: this.platformId,
      appliedAt: new Date().toISOString(),
      success: true,
      appliedCount: changes.length,
      results: changes.map((c) => ({
        date: c.date,
        newPrice: c.newPrice,
        applied: true,
      })),
    };
  }

  /** Always reports the mock adapter as healthy. */
  async selfTest(): Promise<{
    platformId: string;
    healthy: boolean;
    message: string;
    checkedAt: string;
  }> {
    return {
      platformId: this.platformId,
      healthy: true,
      message: 'Mock adapter is operational',
      checkedAt: new Date().toISOString(),
    };
  }
}
|
||||
151
apps/scraper/src/adapters/mock/mock-data.ts
Normal file
151
apps/scraper/src/adapters/mock/mock-data.ts
Normal file
@@ -0,0 +1,151 @@
|
||||
/** Pool of guest names that mock reservations draw from. */
const GUEST_NAMES = [
  'Sarah Johnson', 'Michael Chen', 'Emily Rodriguez',
  'James Williams', 'Olivia Martinez', 'David Kim',
  'Sophia Brown', 'Daniel Taylor', 'Isabella Anderson',
  'Matthew Thomas', 'Ava Wilson', 'Christopher Lee',
  'Mia Garcia', 'Andrew Jackson', 'Charlotte White',
];

/** Statuses a mock reservation can carry. */
const RESERVATION_STATUSES = ['confirmed', 'checked_in', 'completed', 'cancelled'] as const;
|
||||
|
||||
/**
 * Draws a random integer from the inclusive range `min`..`max`.
 */
function randomBetween(min: number, max: number): number {
  const span = max - min + 1;
  const offset = Math.floor(Math.random() * span);
  return offset + min;
}
|
||||
|
||||
/**
 * Draws a random number between `min` and `max` and rounds it to `decimals`
 * fractional digits (two by default).
 */
function randomFloat(min: number, max: number, decimals = 2): number {
  const width = max - min;
  const raw = Math.random() * width + min;
  const rounded = raw.toFixed(decimals);
  return Number.parseFloat(rounded);
}
|
||||
|
||||
/**
 * Picks one element of `arr` at a random index.
 */
function randomItem<T>(arr: readonly T[]): T {
  const index = Math.floor(Math.random() * arr.length);
  return arr[index];
}
|
||||
|
||||
/**
 * Returns the `YYYY-MM-DD` date that lies `days` days after `dateStr`
 * (before it when `days` is negative).
 *
 * The arithmetic is done entirely in UTC. A date-only string is parsed as UTC
 * midnight and the result is read back with `toISOString()`, so stepping with
 * the local-time `setDate` could land on the same UTC date across a
 * daylight-saving change and return the input unchanged.
 *
 * @param dateStr - Start date, expected as `YYYY-MM-DD`.
 * @param days - Number of days to add; may be negative or zero.
 */
function addDays(dateStr: string, days: number): string {
  const d = new Date(dateStr);
  d.setUTCDate(d.getUTCDate() + days);
  return d.toISOString().split('T')[0];
}
|
||||
|
||||
/**
 * Returns the weekday of `dateStr` as 0 (Sunday) through 6 (Saturday).
 *
 * A date-only string is parsed as UTC midnight, so the weekday is read in UTC
 * too; reading it in local time reports the previous weekday in time zones
 * west of UTC.
 *
 * @param dateStr - Date, expected as `YYYY-MM-DD`.
 */
function getDayOfWeek(dateStr: string): number {
  return new Date(dateStr).getUTCDay();
}
|
||||
|
||||
/**
 * Produces a randomised performance snapshot labelled `last_30_days`.
 *
 * Listing views are drawn between 30% and 70% of the search views, and total
 * revenue between 2x and 5x of `avgDailyRate * bookingsCount`.
 *
 * @param platformId - Copied into the snapshot unchanged.
 */
export function generatePerformanceSnapshot(platformId: string) {
  const viewsSearch = randomBetween(500, 2000);
  const listingViewsMin = Math.floor(viewsSearch * 0.3);
  const listingViewsMax = Math.floor(viewsSearch * 0.7);
  const viewsListing = randomBetween(listingViewsMin, listingViewsMax);

  const bookingsCount = randomBetween(5, 25);
  const conversionRate = randomFloat(1.5, 8.0);
  const occupancyRate = randomFloat(60, 85);
  const avgDailyRate = randomFloat(150, 250);

  const bookedValue = avgDailyRate * bookingsCount;
  const revenueTotal = randomFloat(bookedValue * 2, bookedValue * 5);

  return {
    platformId,
    capturedAt: new Date().toISOString(),
    periodLabel: 'last_30_days',
    viewsSearch,
    viewsListing,
    conversionRate,
    bookingsCount,
    occupancyRate,
    avgDailyRate,
    revenueTotal,
    rawJson: { source: 'mock', generatedAt: new Date().toISOString() },
  };
}
|
||||
|
||||
/**
 * Produces randomised mock reservations, sorted by check-in date.
 *
 * Check-in falls between 30 days ago and 60 days ahead and stays last 2-7
 * nights. Status follows the stay's position relative to today:
 * - `completed` once check-out is today or earlier,
 * - `checked_in` when check-in has passed but check-out has not,
 * - otherwise `confirmed`, with roughly a 10% chance of `cancelled`.
 *
 * `bookedAt` is always in the past and earlier than check-in.
 *
 * @param platformId - Copied into every reservation and into its generated id.
 * @param count - Number of reservations; defaults to a random value from 5 to 10.
 */
export function generateReservations(platformId: string, count: number = randomBetween(5, 10)) {
  const reservations = [];
  const today = new Date().toISOString().split('T')[0];

  for (let i = 0; i < count; i++) {
    const daysOffset = randomBetween(-30, 60);
    const checkIn = addDays(today, daysOffset);
    const nights = randomBetween(2, 7);
    const checkOut = addDays(checkIn, nights);
    const nightlyRate = randomFloat(150, 300);
    const cleaningFee = randomFloat(75, 150);
    const platformFee = randomFloat(nightlyRate * nights * 0.03, nightlyRate * nights * 0.05);
    const totalPayout = Number((nightlyRate * nights + cleaningFee - platformFee).toFixed(2));
    const guestsCount = randomBetween(1, 6);

    // Days from today until check-out; zero or negative means the stay is over.
    // Judging by check-in alone would label short stays that already ended
    // as still checked in.
    const checkOutOffset = daysOffset + nights;

    let status: (typeof RESERVATION_STATUSES)[number];
    if (checkOutOffset <= 0) {
      status = 'completed';
    } else if (daysOffset < 0) {
      status = 'checked_in';
    } else {
      status = Math.random() > 0.1 ? 'confirmed' : 'cancelled';
    }

    // Keep the booking at least 14 days back and, for past check-ins, at
    // least one day before check-in, so a stay is never booked after it began.
    const bookedDaysAgo = randomBetween(Math.max(14, 1 - daysOffset), 90);

    reservations.push({
      platformId,
      platformReservationId: `MOCK-${platformId.toUpperCase()}-${Date.now()}-${i}`,
      guestName: randomItem(GUEST_NAMES),
      checkIn,
      checkOut,
      nights,
      guestsCount,
      nightlyRate,
      cleaningFee,
      platformFee: Number(platformFee.toFixed(2)),
      totalPayout,
      status,
      bookedAt: new Date(Date.now() - bookedDaysAgo * 86400000).toISOString(),
      rawJson: {
        source: 'mock',
        generatedAt: new Date().toISOString(),
      },
    });
  }

  return reservations.sort(
    (a, b) => new Date(a.checkIn).getTime() - new Date(b.checkIn).getTime(),
  );
}
|
||||
|
||||
/**
 * Produces one randomised price entry per day from `from` through `to`
 * (both inclusive, compared as strings).
 *
 * Days whose weekday index is 5 or 6 are priced 200-300 with a 2-3 night
 * minimum stay; all other days are priced 150-220 with a 1 night minimum.
 * Each day is available with a probability of roughly 85%.
 *
 * @param platformId - Copied into every entry.
 * @param from - First date, expected as `YYYY-MM-DD`.
 * @param to - Last date, expected as `YYYY-MM-DD`.
 */
export function generateDailyPrices(platformId: string, from: string, to: string) {
  const prices = [];

  for (let day = from; day <= to; day = addDays(day, 1)) {
    const weekday = getDayOfWeek(day);
    const isWeekend = weekday === 5 || weekday === 6;

    const price = isWeekend ? randomFloat(200, 300) : randomFloat(150, 220);
    const isAvailable = Math.random() > 0.15;
    const minStayNights = isWeekend ? randomBetween(2, 3) : 1;

    prices.push({
      platformId,
      date: day,
      price,
      isAvailable,
      minStayNights,
      syncedAt: new Date().toISOString(),
    });
  }

  return prices;
}
|
||||
48
apps/scraper/src/adapters/vrbo/VrboAdapter.ts
Normal file
48
apps/scraper/src/adapters/vrbo/VrboAdapter.ts
Normal file
@@ -0,0 +1,48 @@
|
||||
import { PlatformAdapter } from '../base/PlatformAdapter.js';
|
||||
|
||||
/** Message carried by every error the VRBO scaffold throws. */
const VRBO_NOT_IMPLEMENTED = 'VRBO adapter not yet implemented';

/**
 * Scaffold for the VRBO adapter. Every operation rejects with a
 * "not yet implemented" error, except `selfTest`, which reports the adapter
 * as unhealthy.
 */
export class VrboAdapter extends PlatformAdapter {
  readonly platformId = 'vrbo';
  readonly displayName = 'VRBO';

  async login(_credentials: { email: string; password: string }): Promise<void> {
    throw new Error(VRBO_NOT_IMPLEMENTED);
  }

  async isSessionValid(): Promise<boolean> {
    throw new Error(VRBO_NOT_IMPLEMENTED);
  }

  async scrapePerformanceMetrics(): Promise<any> {
    throw new Error(VRBO_NOT_IMPLEMENTED);
  }

  async scrapeReservations(): Promise<any[]> {
    throw new Error(VRBO_NOT_IMPLEMENTED);
  }

  async scrapePricing(_dateRange: { from: string; to: string }): Promise<any[]> {
    throw new Error(VRBO_NOT_IMPLEMENTED);
  }

  async previewPriceChanges(_changes: any[]): Promise<any> {
    throw new Error(VRBO_NOT_IMPLEMENTED);
  }

  async applyPriceChanges(_changes: any[]): Promise<any> {
    throw new Error(VRBO_NOT_IMPLEMENTED);
  }

  /** Reports the adapter as unhealthy with the message `Not implemented`. */
  async selfTest(): Promise<{
    platformId: string;
    healthy: boolean;
    message: string;
    checkedAt: string;
  }> {
    const checkedAt = new Date().toISOString();
    return { platformId: this.platformId, healthy: false, message: 'Not implemented', checkedAt };
  }
}
|
||||
20
apps/scraper/src/adapters/vrbo/vrbo.flows.ts
Normal file
20
apps/scraper/src/adapters/vrbo/vrbo.flows.ts
Normal file
@@ -0,0 +1,20 @@
|
||||
import type { Page } from 'playwright';
|
||||
|
||||
/**
 * Placeholder for the VRBO login flow; always rejects because the flow is
 * not implemented yet. All arguments are ignored.
 */
export async function loginFlow(_page: Page, _email: string, _password: string): Promise<void> {
  const reason = 'VRBO login flow not yet implemented';
  throw new Error(reason);
}
|
||||
|
||||
/**
 * Placeholder for the VRBO performance scrape; always rejects because the
 * flow is not implemented yet. The page argument is ignored.
 */
export async function scrapePerformanceFlow(_page: Page): Promise<any> {
  const reason = 'VRBO scrapePerformance flow not yet implemented';
  throw new Error(reason);
}
|
||||
|
||||
/**
 * Placeholder for the VRBO pricing scrape; always rejects because the flow
 * is not implemented yet. Both arguments are ignored.
 */
export async function scrapePricingFlow(
  _page: Page,
  _dateRange: { from: string; to: string },
): Promise<any[]> {
  const reason = 'VRBO scrapePricing flow not yet implemented';
  throw new Error(reason);
}
|
||||
|
||||
/**
 * Placeholder for the VRBO reservations scrape; always rejects because the
 * flow is not implemented yet. The page argument is ignored.
 */
export async function scrapeReservationsFlow(_page: Page): Promise<any[]> {
  const reason = 'VRBO scrapeReservations flow not yet implemented';
  throw new Error(reason);
}
|
||||
18
apps/scraper/src/adapters/vrbo/vrbo.selectors.ts
Normal file
18
apps/scraper/src/adapters/vrbo/vrbo.selectors.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
/**
 * Selector slots for the VRBO dashboard. Every slot is still an empty string;
 * none has been filled in yet.
 */
export const SELECTORS = {
  // Login
  LOGIN_EMAIL: '',
  LOGIN_PASSWORD: '',
  LOGIN_SUBMIT: '',

  // Dashboard navigation
  DASHBOARD_NAV: '',
  PERFORMANCE_TAB: '',
  RESERVATIONS_TAB: '',
  PRICING_TAB: '',

  // Date pickers
  DATE_PICKER_FROM: '',
  DATE_PICKER_TO: '',

  // Page content
  METRICS_CONTAINER: '',
  RESERVATIONS_TABLE: '',
  PRICING_CALENDAR: '',
  PRICE_INPUT: '',
  SAVE_PRICE_BUTTON: '',

  // General
  NEXT_PAGE_BUTTON: '',
  LOADING_SPINNER: '',
} as const;
|
||||
270
apps/scraper/src/index.ts
Normal file
270
apps/scraper/src/index.ts
Normal file
@@ -0,0 +1,270 @@
|
||||
import dotenv from 'dotenv';
|
||||
import { fileURLToPath } from 'url';
|
||||
import { dirname, resolve } from 'path';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
dotenv.config({ path: resolve(__dirname, '../../../.env') });
|
||||
|
||||
import Fastify from 'fastify';
|
||||
import { registry } from './adapters/base/AdapterRegistry.js';
|
||||
import { MockAdapter } from './adapters/mock/MockAdapter.js';
|
||||
import { AirbnbAdapter } from './adapters/airbnb/AirbnbAdapter.js';
|
||||
import { VrboAdapter } from './adapters/vrbo/VrboAdapter.js';
|
||||
import { jobQueue, type JobType } from './queue/jobQueue.js';
|
||||
import { startWorker } from './queue/worker.js';
|
||||
|
||||
// Register adapters
|
||||
// Each adapter is constructed and registered in turn: mock, Airbnb, VRBO.
for (const Adapter of [MockAdapter, AirbnbAdapter, VrboAdapter]) {
  registry.register(new Adapter());
}

/** Fastify instance serving the scraper's HTTP API, with request logging enabled. */
const app = Fastify({ logger: true });
|
||||
|
||||
// ── Health Check ──────────────────────────────────────────────────────────────
|
||||
|
||||
// Liveness probe: reports the service name, the current time and the ids of
// all registered adapters.
app.get('/health', async () => ({
  status: 'ok',
  service: 'scraper',
  timestamp: new Date().toISOString(),
  adapters: registry.list(),
}));
|
||||
|
||||
// ── Create Scrape Job ─────────────────────────────────────────────────────────
|
||||
|
||||
// Enqueues a scrape job.
// Responds 400 when a field is missing, the platform is unknown or the job
// type is invalid; responds 201 with the queued job otherwise.
app.post<{
  Body: { platformId: string; jobType: JobType; triggeredBy: string };
}>('/jobs', async (request, reply) => {
  // `request.body` can be undefined when the request carries no payload;
  // fall back to an empty object so the caller gets the 400 below instead of
  // a destructuring TypeError surfacing as a 500.
  const { platformId, jobType, triggeredBy }: Partial<typeof request.body> = request.body ?? {};

  if (!platformId || !jobType || !triggeredBy) {
    return reply.status(400).send({ error: 'Missing required fields: platformId, jobType, triggeredBy' });
  }

  if (!registry.has(platformId)) {
    return reply.status(400).send({ error: `Unknown platform: ${platformId}` });
  }

  const validJobTypes: JobType[] = ['performance', 'reservations', 'pricing', 'full'];
  if (!validJobTypes.includes(jobType)) {
    return reply.status(400).send({ error: `Invalid jobType. Must be one of: ${validJobTypes.join(', ')}` });
  }

  const job = jobQueue.enqueue({ platformId, jobType, triggeredBy });
  return reply.status(201).send(job);
});
|
||||
|
||||
// ── Get Job Status ────────────────────────────────────────────────────────────
|
||||
|
||||
// Returns the job with the given id, or 404 when the queue has no such job.
app.get<{
  Params: { id: string };
}>('/jobs/:id', async (request, reply) => {
  const { id } = request.params;
  const job = jobQueue.getJob(id);
  return job ? job : reply.status(404).send({ error: 'Job not found' });
});
|
||||
|
||||
// ── Platform Self-Test ────────────────────────────────────────────────────────
|
||||
|
||||
// Runs the adapter's self-test. Responds 404 for an unknown platform and 500
// with an unhealthy report when the self-test itself throws.
app.post<{
  Params: { id: string };
}>('/platforms/:id/test', async (request, reply) => {
  const platformId = request.params.id;
  const adapter = registry.get(platformId);
  if (!adapter) {
    return reply.status(404).send({ error: `Unknown platform: ${platformId}` });
  }

  try {
    return await adapter.selfTest();
  } catch (err: any) {
    const failure = {
      platformId,
      healthy: false,
      message: err.message,
      checkedAt: new Date().toISOString(),
    };
    return reply.status(500).send(failure);
  }
});
|
||||
|
||||
// ── Platform Login ───────────────────────────────────────────────────────────
|
||||
|
||||
// Logs the adapter in. Credentials come from the request body; any that are
// missing fall back to the <PLATFORM>_EMAIL / <PLATFORM>_PASSWORD environment
// variables. Responds 404 for an unknown platform, 400 when credentials are
// missing and 500 when the login throws.
app.post<{
  Params: { id: string };
  Body: { email?: string; password?: string };
}>('/platforms/:id/login', async (request, reply) => {
  const platformId = request.params.id;
  const adapter = registry.get(platformId);
  if (!adapter) {
    return reply.status(404).send({ error: `Unknown platform: ${platformId}` });
  }

  const envPrefix = platformId.toUpperCase();
  const email = request.body?.email || process.env[`${envPrefix}_EMAIL`] || '';
  const password = request.body?.password || process.env[`${envPrefix}_PASSWORD`] || '';

  if (!email || !password) {
    return reply.status(400).send({
      error: `Missing credentials. Provide email/password in body or set ${envPrefix}_EMAIL and ${envPrefix}_PASSWORD env vars.`,
    });
  }

  try {
    await adapter.login({ email, password });
  } catch (err: any) {
    return reply.status(500).send({
      platformId,
      status: 'login_failed',
      message: err.message,
      at: new Date().toISOString(),
    });
  }

  return {
    platformId,
    status: 'logged_in',
    message: 'Login successful. Session cookies saved.',
    at: new Date().toISOString(),
  };
});
|
||||
|
||||
// ── Platform Session Check ──────────────────────────────────────────────────
|
||||
|
||||
// Reports whether the adapter's session is valid. Responds 404 for an unknown
// platform and 500 (with sessionValid: false) when the check throws.
app.get<{
  Params: { id: string };
}>('/platforms/:id/session', async (request, reply) => {
  const platformId = request.params.id;
  const adapter = registry.get(platformId);
  if (!adapter) {
    return reply.status(404).send({ error: `Unknown platform: ${platformId}` });
  }

  try {
    const sessionValid = await adapter.isSessionValid();
    return { platformId, sessionValid, checkedAt: new Date().toISOString() };
  } catch (err: any) {
    const failure = {
      platformId,
      sessionValid: false,
      error: err.message,
      checkedAt: new Date().toISOString(),
    };
    return reply.status(500).send(failure);
  }
});
|
||||
|
||||
// ── Debug DOM Inspection ────────────────────────────────────────────────────
|
||||
|
||||
// Debug helper: opens `?url=` in the adapter's browser page and returns a
// summary of the DOM (table headers/rows, data-testid values, price-like text,
// links containing long numeric ids, and the start of the <main> text).
//
// Responds 404 for an unknown platform, 400 for a missing, malformed or
// non-http(s) URL, 501 when the adapter has no `ensureBrowser`, and 500 when
// navigation or evaluation throws.
app.get<{
  Params: { id: string };
  Querystring: { url: string };
}>('/platforms/:id/debug-dom', async (request, reply) => {
  const adapter = registry.get(request.params.id) as any;
  if (!adapter) {
    return reply.status(404).send({ error: `Unknown platform: ${request.params.id}` });
  }

  const url = (request.query as any).url;
  if (!url) {
    return reply.status(400).send({ error: 'Provide ?url= parameter' });
  }

  // The URL is caller-supplied and is opened in the adapter's own browser, so
  // only web URLs are accepted; `file:` and similar schemes would let the
  // endpoint read local resources.
  let target: URL;
  try {
    target = new URL(String(url));
  } catch {
    return reply.status(400).send({ error: 'Invalid ?url= parameter' });
  }
  if (target.protocol !== 'http:' && target.protocol !== 'https:') {
    return reply.status(400).send({ error: 'Only http(s) URLs are allowed' });
  }

  // `ensureBrowser` is not part of the adapter interface, so report adapters
  // that lack it explicitly instead of failing with an opaque 500.
  if (typeof adapter.ensureBrowser !== 'function') {
    return reply
      .status(501)
      .send({ error: `Platform ${request.params.id} does not support DOM inspection` });
  }

  try {
    const page = await adapter.ensureBrowser();

    await page.goto(url, { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(5000);

    const result = await page.evaluate(() => {
      // Table structure
      const headers: string[] = [];
      document.querySelectorAll('table th, table thead td').forEach((th: any) => {
        headers.push(th.textContent?.trim() || '');
      });

      const rows: string[][] = [];
      document.querySelectorAll('table tbody tr, table tr').forEach((tr: any) => {
        const cells: string[] = [];
        tr.querySelectorAll('td').forEach((td: any) => {
          cells.push(td.textContent?.trim().replace(/\n/g, ' | ') || '');
        });
        if (cells.length > 0) rows.push(cells);
      });

      // data-testid values
      const testIds = new Set<string>();
      document.querySelectorAll('[data-testid]').forEach(el => {
        testIds.add(el.getAttribute('data-testid') || '');
      });

      // Price text
      const priceTexts: string[] = [];
      const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
      while (walker.nextNode()) {
        const text = walker.currentNode.textContent?.trim() || '';
        if (/^\$\d+$/.test(text)) priceTexts.push(text);
      }

      // All links with numeric IDs
      const links: { href: string; text: string }[] = [];
      document.querySelectorAll('a[href]').forEach((a: any) => {
        const href = a.getAttribute('href') || '';
        if (/\/\d{5,}/.test(href)) {
          links.push({ href, text: a.textContent?.trim().substring(0, 100) || '' });
        }
      });

      // Main text
      const mainText = document.querySelector('main')?.textContent?.substring(0, 2000) || '';

      return { headers, rows: rows.slice(0, 5), testIds: Array.from(testIds), priceTexts: priceTexts.slice(0, 20), links, mainText };
    });

    return reply.send({ url, ...result });
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    return reply.status(500).send({ error: message });
  }
});
|
||||
|
||||
// ── Apply Price Changes ───────────────────────────────────────────────────────
|
||||
|
||||
// Applies price changes through the adapter. Responds 404 for an unknown
// platform, 400 when the body has no `changes` array and 500 when the adapter
// throws.
app.post<{
  Params: { id: string };
  Body: { changes: any[] };
}>('/platforms/:id/price-apply', async (request, reply) => {
  const adapter = registry.get(request.params.id);
  if (!adapter) {
    return reply.status(404).send({ error: `Unknown platform: ${request.params.id}` });
  }

  // `request.body` can be undefined when the request carries no payload; read
  // it defensively so the caller gets the 400 below rather than a
  // destructuring TypeError surfacing as a 500.
  const changes: unknown = request.body?.changes;
  if (!Array.isArray(changes)) {
    return reply.status(400).send({ error: 'Request body must include a changes array' });
  }

  try {
    const result = await adapter.applyPriceChanges(changes);
    return result;
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    return reply.status(500).send({ error: message });
  }
});
|
||||
|
||||
// ── Start Server ──────────────────────────────────────────────────────────────
|
||||
|
||||
/** Port the HTTP server binds to: SCRAPER_PORT when it parses to a non-zero number, otherwise 3001. */
const PORT = Number(process.env.SCRAPER_PORT) || 3001;

/**
 * Boots the service: starts the job worker, then binds the HTTP server on all
 * interfaces. Any failure is logged through the app logger and the process
 * exits with status 1.
 */
async function start() {
  const listenOptions = { port: PORT, host: '0.0.0.0' };

  try {
    startWorker();
    await app.listen(listenOptions);
    console.log(`Scraper service running on port ${PORT}`);
  } catch (startupError) {
    app.log.error(startupError);
    process.exit(1);
  }
}

start();
|
||||
78
apps/scraper/src/queue/jobQueue.ts
Normal file
78
apps/scraper/src/queue/jobQueue.ts
Normal file
@@ -0,0 +1,78 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
|
||||
/** Lifecycle state of a queued scrape job. */
export type JobStatus =
  | 'pending'
  | 'running'
  | 'completed'
  | 'failed';

/** Which data set a job collects; `full` covers performance, reservations and pricing. */
export type JobType =
  | 'performance'
  | 'reservations'
  | 'pricing'
  | 'full';

/** A unit of scraping work tracked by the in-memory queue. */
export interface Job {
  /** Unique identifier assigned when the job is enqueued. */
  id: string;
  /** Identifier of the platform adapter that should run the job. */
  platformId: string;
  /** Data set to collect. */
  jobType: JobType;
  /** Free-form label for whoever or whatever requested the job. */
  triggeredBy: string;
  /** Current lifecycle state. */
  status: JobStatus;
  /** ISO timestamp of when the job was enqueued. */
  createdAt: string;
  /** ISO timestamp of when the job was dequeued for processing, or null while pending. */
  startedAt: string | null;
  /** ISO timestamp of when the job finished (successfully or not), or null until then. */
  completedAt: string | null;
  /** Failure message, or null when the job has not failed. */
  errorMessage: string | null;
  /** Number of rows the job gathered, or null until it completes. */
  rowsCollected: number | null;
  /** Scraped payload kept with the job, or null until it completes. */
  result: any | null;
}
|
||||
|
||||
/**
 * In-memory FIFO queue of scrape jobs.
 *
 * Jobs are kept in a map for lookup by id (entries are never removed), while a
 * separate list tracks the ids that are still waiting to be processed.
 */
class JobQueue {
  /** Every job that has been enqueued, keyed by id. */
  private jobs: Map<string, Job> = new Map();

  /** Ids of jobs still waiting to run, oldest first. */
  private pending: string[] = [];

  /**
   * Creates a job in the `pending` state, registers it and appends it to the
   * waiting list.
   *
   * @returns the newly created job.
   */
  enqueue(params: { platformId: string; jobType: JobType; triggeredBy: string }): Job {
    const { platformId, jobType, triggeredBy } = params;

    const created: Job = {
      id: randomUUID(),
      platformId,
      jobType,
      triggeredBy,
      status: 'pending',
      createdAt: new Date().toISOString(),
      startedAt: null,
      completedAt: null,
      errorMessage: null,
      rowsCollected: null,
      result: null,
    };

    this.pending.push(created.id);
    this.jobs.set(created.id, created);

    return created;
  }

  /**
   * Removes the oldest waiting job, marks it `running` and stamps its start time.
   *
   * @returns the job, or undefined when nothing is waiting.
   */
  dequeue(): Job | undefined {
    const nextId = this.pending.shift();
    const next = nextId ? this.jobs.get(nextId) : undefined;

    if (next) {
      next.status = 'running';
      next.startedAt = new Date().toISOString();
    }

    return next;
  }

  /** Looks a job up by id. */
  getJob(id: string): Job | undefined {
    return this.jobs.get(id);
  }

  /**
   * Merges the given fields into the stored job (mutating it in place).
   *
   * @returns the updated job, or undefined when the id is unknown.
   */
  updateJob(id: string, updates: Partial<Pick<Job, 'status' | 'completedAt' | 'errorMessage' | 'rowsCollected' | 'result'>>): Job | undefined {
    const target = this.jobs.get(id);
    return target ? Object.assign(target, updates) : undefined;
  }

  /** Number of jobs that are still waiting to be dequeued. */
  pendingCount(): number {
    return this.pending.length;
  }

  /** All known jobs in insertion order, regardless of status. */
  listJobs(): Job[] {
    return [...this.jobs.values()];
  }
}

/** Process-wide queue instance shared by the HTTP routes and the worker. */
export const jobQueue = new JobQueue();
|
||||
288
apps/scraper/src/queue/worker.ts
Normal file
288
apps/scraper/src/queue/worker.ts
Normal file
@@ -0,0 +1,288 @@
|
||||
import postgres from 'postgres';
|
||||
import { jobQueue, type Job } from './jobQueue.js';
|
||||
import { registry } from '../adapters/base/AdapterRegistry.js';
|
||||
|
||||
/**
 * Converts a value to a full ISO-8601 timestamp without ever throwing.
 *
 * @param val      anything the Date constructor accepts; falsy values count as missing.
 * @param fallback returned when `val` is missing or unparseable; when it is
 *                 omitted or empty, the current time is used instead.
 * @returns an ISO timestamp string (or the fallback as given).
 */
function safeDate(val: any, fallback?: string): string {
  // Evaluated lazily so "now" is only computed when it is actually needed.
  const orDefault = (): string => fallback || new Date().toISOString();

  if (!val) return orDefault();

  try {
    const parsed = new Date(val);
    return Number.isNaN(parsed.getTime()) ? orDefault() : parsed.toISOString();
  } catch {
    // The Date constructor can throw for values that cannot be coerced (e.g. symbols).
    return orDefault();
  }
}
|
||||
|
||||
// ISO date-only forms (YYYY, YYYY-MM, YYYY-MM-DD); the Date parser reads these as UTC.
const ISO_DATE_ONLY = /^\d{4}(?:-\d{2}){0,2}$/;

// Text that carries its own zone: a trailing `Z` / numeric offset, or a zone abbreviation.
const EXPLICIT_ZONE = /(?:Z|[+-]\d{2}:?\d{2})$|\b(?:GMT|UTC?|[ECMP][SD]T)\b/;

/**
 * Converts a value to a `YYYY-MM-DD` calendar date without ever throwing.
 *
 * The Date parser interprets zone-less text such as `'Mar 15, 2024'` or
 * `'2024-03-15T00:00:00'` in the host's local time zone. Formatting such a
 * value through `toISOString()` (UTC) shifts it to the previous day on hosts
 * east of UTC, so those inputs are formatted from their local components
 * instead. Inputs that identify an instant (ISO date-only strings, strings
 * with an explicit zone, numbers, Date objects) keep being reported as their
 * UTC calendar day.
 *
 * @param val      anything the Date constructor accepts; falsy values count as missing.
 * @param fallback returned when `val` is missing or unparseable; when it is
 *                 omitted or empty, today's UTC date is used instead.
 * @returns a `YYYY-MM-DD` string (or the fallback as given).
 */
function safeDateOnly(val: any, fallback?: string): string {
  const orDefault = (): string => fallback || new Date().toISOString().split('T')[0];

  if (!val) return orDefault();

  try {
    // Trim strings so surrounding whitespace cannot change how they are parsed.
    const text = typeof val === 'string' ? val.trim() : null;
    const d = new Date(text ?? val);
    if (isNaN(d.getTime())) return orDefault();

    const parsedAsLocalTime =
      text !== null && !ISO_DATE_ONLY.test(text) && !EXPLICIT_ZONE.test(text);

    if (parsedAsLocalTime) {
      // Read back the same local calendar fields the parser filled in.
      const year = String(d.getFullYear()).padStart(4, '0');
      const month = String(d.getMonth() + 1).padStart(2, '0');
      const day = String(d.getDate()).padStart(2, '0');
      return `${year}-${month}-${day}`;
    }

    return d.toISOString().split('T')[0];
  } catch {
    // The Date constructor can throw for values that cannot be coerced (e.g. symbols).
    return orDefault();
  }
}
|
||||
|
||||
// Shared raw postgres client, created on first use (tagged template = auto-parameterized)
let sql: ReturnType<typeof postgres> | null = null;

/**
 * Returns the shared postgres client, creating it from DATABASE_URL the first
 * time a connection string is available.
 *
 * @returns the client, or null (after logging a warning) when DATABASE_URL is
 *          not set, in which case callers skip their database writes.
 */
function getSql() {
  if (sql) return sql;

  const connectionString = process.env.DATABASE_URL;
  if (connectionString) {
    sql = postgres(connectionString);
    return sql;
  }

  console.warn('[worker] DATABASE_URL not set - DB writes will be skipped');
  return null;
}
|
||||
|
||||
/** Number of days of calendar pricing, starting today, that a pricing scrape covers. */
const PRICING_LOOKAHEAD_DAYS = 90;

/**
 * Builds the date range for a pricing scrape: today through today + `days`,
 * each bound rendered as the `YYYY-MM-DD` part of its ISO (UTC) timestamp.
 */
function pricingDateRange(days: number = PRICING_LOOKAHEAD_DAYS): { from: string; to: string } {
  const start = new Date();
  const end = new Date(start);
  end.setDate(end.getDate() + days);

  return {
    from: start.toISOString().split('T')[0],
    to: end.toISOString().split('T')[0],
  };
}

/**
 * Runs one scrape job against its platform adapter, persists what was
 * collected, then marks the job completed both in the in-memory queue and in
 * the database.
 *
 * Any error raised while resolving the adapter or scraping propagates to the
 * caller, which is responsible for marking the job as failed. An unknown
 * `jobType` also raises.
 * NOTE(review): `registry.getOrThrow` presumably throws for an unregistered
 * platform id; confirm against the registry.
 *
 * @param job the dequeued job to execute.
 */
async function processJob(job: Job): Promise<void> {
  const adapter = registry.getOrThrow(job.platformId);

  let result: any;
  let rowsCollected = 0;

  switch (job.jobType) {
    case 'performance': {
      result = await adapter.scrapePerformanceMetrics();
      rowsCollected = 1; // a performance scrape yields a single snapshot
      await persistPerformanceSnapshot(result);
      break;
    }
    case 'reservations': {
      result = await adapter.scrapeReservations();
      rowsCollected = result.length;
      await persistReservations(result);
      break;
    }
    case 'pricing': {
      result = await adapter.scrapePricing(pricingDateRange());
      rowsCollected = result.length;
      await persistDailyPrices(result);
      break;
    }
    case 'full': {
      // Each data set is scraped and persisted before the next one starts.
      const performance = await adapter.scrapePerformanceMetrics();
      await persistPerformanceSnapshot(performance);

      const reservations = await adapter.scrapeReservations();
      await persistReservations(reservations);

      const pricing = await adapter.scrapePricing(pricingDateRange());
      await persistDailyPrices(pricing);

      rowsCollected = 1 + reservations.length + pricing.length;
      result = { performance, reservations, pricing };
      break;
    }
    default:
      throw new Error(`Unknown job type: ${job.jobType}`);
  }

  jobQueue.updateJob(job.id, {
    status: 'completed',
    completedAt: new Date().toISOString(),
    rowsCollected,
    result,
  });

  await persistJobStatus(job.id, 'completed', rowsCollected);
}
|
||||
|
||||
/**
 * Inserts one performance snapshot row. Does nothing when no database is
 * configured. Failures are logged and swallowed so that a persistence problem
 * does not fail the scrape job.
 *
 * Fields the scraper did not produce are written as NULL: the postgres client
 * rejects `undefined` query parameters, which would otherwise drop the whole
 * row. A missing `rawJson` is stored as the JSON value `null`.
 *
 * @param snapshot scraped metrics object; its fields are read by the names used below.
 */
async function persistPerformanceSnapshot(snapshot: any): Promise<void> {
  const db = getSql();
  if (!db) return;

  try {
    await db`
      INSERT INTO performance_snapshots (platform_id, captured_at, period_label, views_search, views_listing, conversion_rate, bookings_count, occupancy_rate, avg_daily_rate, revenue_total, raw_json)
      VALUES (
        ${snapshot.platformId ?? null},
        ${snapshot.capturedAt ?? null},
        ${snapshot.periodLabel ?? null},
        ${snapshot.viewsSearch ?? null},
        ${snapshot.viewsListing ?? null},
        ${snapshot.conversionRate ?? null},
        ${snapshot.bookingsCount ?? null},
        ${snapshot.occupancyRate ?? null},
        ${snapshot.avgDailyRate ?? null},
        ${snapshot.revenueTotal ?? null},
        ${JSON.stringify(snapshot.rawJson ?? null)}
      )
    `;
  } catch (err) {
    console.error('[worker] Failed to persist performance snapshot:', err);
  }
}
|
||||
|
||||
/** Coerces `value` with `Number()`; returns `fallback` when the result is NaN. */
function numberOrDefault(value: unknown, fallback: number): number {
  const parsed = Number(value);
  return isNaN(parsed) ? fallback : parsed;
}

/**
 * Upserts scraped reservations one row at a time, keyed by
 * (platform_id, platform_reservation_id). Does nothing when no database is
 * configured. A failure on one reservation is logged and does not stop the
 * remaining rows.
 *
 * Dates are normalised through safeDateOnly / safeDate, and numeric fields
 * that do not coerce to a number fall back to 1 (nights, guests) or 0 (money
 * amounts). Text fields the scraper did not produce are written as NULL: the
 * postgres client rejects `undefined` query parameters, which would otherwise
 * drop the whole row. On conflict, `booked_at` and `raw_json` keep their
 * existing values.
 *
 * @param reservations scraped reservation objects.
 */
async function persistReservations(reservations: any[]): Promise<void> {
  const db = getSql();
  if (!db) return;

  for (const r of reservations) {
    try {
      const checkIn = safeDateOnly(r.checkIn);
      const checkOut = safeDateOnly(r.checkOut);
      const bookedAt = safeDate(r.bookedAt);
      const nights = numberOrDefault(r.nights, 1);
      const guestsCount = numberOrDefault(r.guestsCount, 1);
      const nightlyRate = numberOrDefault(r.nightlyRate, 0);
      const cleaningFee = numberOrDefault(r.cleaningFee, 0);
      const platformFee = numberOrDefault(r.platformFee, 0);
      const totalPayout = numberOrDefault(r.totalPayout, 0);

      await db`
        INSERT INTO reservations (platform_id, platform_reservation_id, guest_name, check_in, check_out, nights, guests_count, nightly_rate, cleaning_fee, platform_fee, total_payout, status, booked_at, raw_json)
        VALUES (
          ${r.platformId ?? null},
          ${r.platformReservationId ?? null},
          ${r.guestName ?? null},
          ${checkIn},
          ${checkOut},
          ${nights},
          ${guestsCount},
          ${nightlyRate},
          ${cleaningFee},
          ${platformFee},
          ${totalPayout},
          ${r.status ?? null},
          ${bookedAt},
          ${JSON.stringify(r.rawJson ?? null)}
        )
        ON CONFLICT (platform_id, platform_reservation_id) DO UPDATE SET
          guest_name = EXCLUDED.guest_name,
          check_in = EXCLUDED.check_in,
          check_out = EXCLUDED.check_out,
          nights = EXCLUDED.nights,
          guests_count = EXCLUDED.guests_count,
          nightly_rate = EXCLUDED.nightly_rate,
          cleaning_fee = EXCLUDED.cleaning_fee,
          platform_fee = EXCLUDED.platform_fee,
          total_payout = EXCLUDED.total_payout,
          status = EXCLUDED.status,
          synced_at = NOW()
      `;
    } catch (err) {
      console.error('[worker] Failed to persist reservation:', err);
    }
  }
}
|
||||
|
||||
/**
 * Upserts scraped calendar prices one row at a time, keyed by
 * (platform_id, date). Does nothing when no database is configured. A failure
 * on one row is logged and does not stop the remaining rows.
 *
 * Fields the scraper did not produce are written as NULL (or, for `syncedAt`,
 * the current time): the postgres client rejects `undefined` query
 * parameters, which would otherwise drop the whole row.
 *
 * @param prices scraped per-day price objects.
 */
async function persistDailyPrices(prices: any[]): Promise<void> {
  const db = getSql();
  if (!db) return;

  for (const p of prices) {
    try {
      await db`
        INSERT INTO daily_prices (platform_id, date, price, is_available, min_stay_nights, synced_at)
        VALUES (
          ${p.platformId ?? null},
          ${p.date ?? null},
          ${p.price ?? null},
          ${p.isAvailable ?? null},
          ${p.minStayNights ?? null},
          ${p.syncedAt ?? new Date().toISOString()}
        )
        ON CONFLICT (platform_id, date) DO UPDATE SET
          price = EXCLUDED.price,
          is_available = EXCLUDED.is_available,
          min_stay_nights = EXCLUDED.min_stay_nights,
          synced_at = NOW()
      `;
    } catch (err) {
      console.error('[worker] Failed to persist daily price:', err);
    }
  }
}
|
||||
|
||||
/**
 * Records a job's final status, completion time and row count on its
 * `scrape_jobs` row. Skipped when no database is configured; a failed update
 * is only logged as a warning.
 *
 * @param jobId         job id, cast to uuid in the query.
 * @param status        status value to store.
 * @param rowsCollected number of rows the job gathered.
 */
async function persistJobStatus(jobId: string, status: string, rowsCollected: number): Promise<void> {
  const client = getSql();

  if (client) {
    try {
      await client`
        UPDATE scrape_jobs SET status = ${status}, completed_at = NOW(), rows_collected = ${rowsCollected}
        WHERE id = ${jobId}::uuid
      `;
    } catch (dbError) {
      // The job may exist only in the in-memory queue, not in the database.
      console.warn('[worker] Could not update job in DB:', dbError);
    }
  }
}
|
||||
|
||||
/** True while a poll cycle is in flight; prevents overlapping cycles. */
let polling = false;

/** Handle of the polling timer, or null while the worker is stopped. */
let pollInterval: ReturnType<typeof setInterval> | null = null;

/**
 * Runs one worker cycle: takes at most one job off the queue and processes it.
 *
 * Returns immediately when a previous cycle is still running or the queue is
 * empty. A job failure is logged, recorded on the in-memory job and, when a
 * database is configured, written to `scrape_jobs` on a best-effort basis.
 */
async function poll(): Promise<void> {
  if (polling) return;
  polling = true;

  try {
    const job = jobQueue.dequeue();
    if (!job) return;

    console.log(`[worker] Processing job ${job.id} (${job.jobType} for ${job.platformId})`);

    try {
      await processJob(job);
      console.log(`[worker] Job ${job.id} completed`);
    } catch (err: unknown) {
      // Thrown values are not guaranteed to be Error instances; always derive
      // a string so the job and the database never receive `undefined`.
      const message = err instanceof Error ? err.message : String(err);

      console.error(`[worker] Job ${job.id} failed:`, message);
      jobQueue.updateJob(job.id, {
        status: 'failed',
        completedAt: new Date().toISOString(),
        errorMessage: message,
      });

      // Best effort: neither obtaining the client nor the update itself may
      // turn a recorded job failure into a rejected poll cycle.
      try {
        const db = getSql();
        if (db) {
          await db`
            UPDATE scrape_jobs SET status = 'failed', completed_at = NOW(), error_message = ${message}
            WHERE id = ${job.id}::uuid
          `;
        }
      } catch (dbErr) {
        console.warn('[worker] Could not record job failure in DB:', dbErr);
      }
    }
  } finally {
    polling = false;
  }
}
|
||||
|
||||
/**
 * Starts the background worker: runs a poll cycle immediately and then every
 * `intervalMs` milliseconds. Calling it while the worker is already running
 * does nothing.
 *
 * @param intervalMs delay between poll cycles, in milliseconds (default 2000).
 */
export function startWorker(intervalMs = 2000): void {
  if (pollInterval) return;

  console.log(`[worker] Starting worker (polling every ${intervalMs}ms)`);

  // poll() returns a promise; attach a handler so an unexpected rejection is
  // logged instead of surfacing as an unhandled rejection.
  const runCycle = (): void => {
    poll().catch((err: unknown) => {
      console.error('[worker] Unexpected polling error:', err);
    });
  };

  pollInterval = setInterval(runCycle, intervalMs);

  // Run once immediately
  runCycle();
}
|
||||
|
||||
/**
 * Stops the background worker by cancelling its polling timer. Does nothing
 * when the worker is not running. A poll cycle that is already in flight is
 * not interrupted.
 */
export function stopWorker(): void {
  if (!pollInterval) return;

  const activeTimer = pollInterval;
  pollInterval = null;
  clearInterval(activeTimer);

  console.log('[worker] Worker stopped');
}
|
||||
54
apps/scraper/src/utils/browser.ts
Normal file
54
apps/scraper/src/utils/browser.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
import { chromium, type Browser } from 'playwright';
|
||||
|
||||
/** User agent applied to the scraper's browser context when the caller does not supply one. */
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36';

/** Optional settings for launching a scraper browser; every field has a default. */
export interface BrowserOptions {
  /** Launch without a visible window (default: true). */
  headless?: boolean;
  /** User agent string for the browser context (default: DEFAULT_USER_AGENT). */
  userAgent?: string;
  /** Viewport and window width in pixels (default: 1920). */
  viewportWidth?: number;
  /** Viewport and window height in pixels (default: 1080). */
  viewportHeight?: number;
}
|
||||
|
||||
/**
 * Launches Chromium with automation-masking flags and prepares one browser
 * context configured with the requested user agent, viewport, en-US locale and
 * America/New_York time zone, plus an init script that hides
 * `navigator.webdriver`.
 *
 * If preparing the context fails, the launched browser is closed before the
 * error is rethrown so that no browser process is left behind.
 *
 * NOTE(review): only the Browser is returned. The prepared context stays
 * attached to it (reachable through `browser.contexts()`); a context created
 * later with `browser.newContext()` would not carry these settings. Confirm
 * how callers obtain their pages.
 *
 * @param options launch and context settings; see BrowserOptions for defaults.
 * @returns the launched browser.
 */
export async function createBrowser(options: BrowserOptions = {}): Promise<Browser> {
  const {
    headless = true,
    userAgent = DEFAULT_USER_AGENT,
    viewportWidth = 1920,
    viewportHeight = 1080,
  } = options;

  const browser = await chromium.launch({
    headless,
    args: [
      '--disable-blink-features=AutomationControlled',
      '--disable-features=IsolateOrigins,site-per-process',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      `--window-size=${viewportWidth},${viewportHeight}`,
    ],
  });

  try {
    const context = await browser.newContext({
      userAgent,
      viewport: { width: viewportWidth, height: viewportHeight },
      locale: 'en-US',
      timezoneId: 'America/New_York',
      permissions: [],
      javaScriptEnabled: true,
    });

    // Remove the webdriver flag to avoid detection
    await context.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    });

    // Close the context's first page if one exists - callers use context.newPage()
    const [firstPage] = context.pages();
    if (firstPage) {
      await firstPage.close();
    }
  } catch (setupError) {
    // Release the browser process; a failure to close must not mask the
    // original setup error.
    await browser.close().catch(() => undefined);
    throw setupError;
  }

  return browser;
}
|
||||
8
apps/scraper/src/utils/delay.ts
Normal file
8
apps/scraper/src/utils/delay.ts
Normal file
@@ -0,0 +1,8 @@
|
||||
/**
 * Waits for a randomly chosen number of milliseconds.
 *
 * The wait is `min` plus a whole number drawn from 0 to `max - min`, so for
 * integer bounds it lies in the inclusive range [min, max]. Intended to give
 * browser automation human-like pauses.
 *
 * @param min shortest wait, in milliseconds.
 * @param max longest wait, in milliseconds.
 * @returns a promise that resolves once the wait has elapsed.
 */
export function randomDelay(min: number, max: number): Promise<void> {
  const span = max - min + 1;
  const waitMs = min + Math.floor(Math.random() * span);

  return new Promise<void>((done) => {
    setTimeout(done, waitMs);
  });
}
|
||||
62
apps/scraper/src/utils/encryption.ts
Normal file
62
apps/scraper/src/utils/encryption.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
import { createCipheriv, createDecipheriv, randomBytes } from 'node:crypto';
|
||||
|
||||
/** Cipher used for stored secrets: AES with a 256-bit key in GCM mode. */
const ALGORITHM = 'aes-256-gcm';

/** Length in bytes of the random IV generated for every encryption. */
const IV_LENGTH = 12;

/** Length in bytes of the GCM authentication tag expected on decryption. */
const TAG_LENGTH = 16;

/** Text encoding used for the key and for each part of an encrypted string. */
const ENCODING = 'base64' as const;
|
||||
|
||||
/**
 * Reads the encryption key from the ENCRYPTION_KEY environment variable.
 *
 * @returns the decoded 32-byte key.
 * @throws Error when the variable is unset or empty, or when its base64
 *         content does not decode to exactly 32 bytes.
 */
function getEncryptionKey(): Buffer {
  const encodedKey = process.env.ENCRYPTION_KEY;
  if (!encodedKey) {
    throw new Error('ENCRYPTION_KEY environment variable is not set');
  }

  const decodedKey = Buffer.from(encodedKey, ENCODING);
  if (decodedKey.length === 32) {
    return decodedKey;
  }

  throw new Error('ENCRYPTION_KEY must be exactly 32 bytes (base64-encoded)');
}
|
||||
|
||||
/**
 * Encrypts a UTF-8 string with the configured key, using a fresh random IV
 * for every call.
 *
 * @param plaintext text to protect.
 * @returns `iv:tag:ciphertext`, each part base64-encoded.
 * @throws Error when the encryption key is missing or malformed.
 */
export function encrypt(plaintext: string): string {
  const key = getEncryptionKey();
  const iv = randomBytes(IV_LENGTH);
  const cipher = createCipheriv(ALGORITHM, key, iv);

  const ciphertext = Buffer.concat([cipher.update(plaintext, 'utf8'), cipher.final()]);
  // The auth tag is only available once the cipher has been finalised.
  const tag = cipher.getAuthTag();

  // Format: iv:tag:ciphertext (all base64)
  const segments = [iv, tag, ciphertext].map((part) => part.toString(ENCODING));
  return segments.join(':');
}
|
||||
|
||||
/**
 * Decrypts a string in the `iv:tag:ciphertext` (base64) format and verifies
 * its authentication tag.
 *
 * @param encryptedStr value in the three-part encrypted format.
 * @returns the original UTF-8 plaintext.
 * @throws Error when the key is missing or malformed, when the input does not
 *         have exactly three parts, when the IV or tag has the wrong length,
 *         or when authentication fails.
 */
export function decrypt(encryptedStr: string): string {
  const key = getEncryptionKey();

  const segments = encryptedStr.split(':');
  if (segments.length !== 3) {
    throw new Error('Invalid encrypted string format');
  }

  const [iv, tag, ciphertext] = segments.map((segment) => Buffer.from(segment, ENCODING));

  if (iv.length !== IV_LENGTH) {
    throw new Error('Invalid IV length');
  }
  if (tag.length !== TAG_LENGTH) {
    throw new Error('Invalid auth tag length');
  }

  const decipher = createDecipheriv(ALGORITHM, key, iv);
  // The tag must be supplied before final(), which throws on a mismatch.
  decipher.setAuthTag(tag);

  const plaintext = Buffer.concat([decipher.update(ciphertext), decipher.final()]);
  return plaintext.toString('utf8');
}
|
||||
8
apps/scraper/tsconfig.json
Normal file
8
apps/scraper/tsconfig.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"extends": "../../tsconfig.base.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "./dist",
|
||||
"rootDir": "./src"
|
||||
},
|
||||
"include": ["src"]
|
||||
}
|
||||
Reference in New Issue
Block a user