From c12ad94b7fcdebe378ecbc0ea53af9f98dc539bb Mon Sep 17 00:00:00 2001
From: olsch01
Date: Tue, 3 Mar 2026 10:44:58 -0500
Subject: [PATCH] fix: rewrite Bankrate scraper to extract actual bank names from offer cards
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous scraper was picking up Bankrate's summary table (.wealth-product-rate-list) which only has "best rates" per term with no bank names, resulting in entries like "Top CD Rate - 1 year".

Now targets the actual bank offer cards in .wrt-RateSections-sponsoredoffers and .wrt-RateSections-additionaloffers sections.

Key changes:
- Extract bank names from img[alt] (logo) with text-based fallbacks
- Fix APY parsing to avoid Bankrate score leaking in (e.g. "4.5" score concatenated with "4.00%" APY was parsed as 0.4%)
- Handle both "Min. deposit" (CDs) and "Min. balance for APY" (savings/MM)
- Parse abbreviated terms from Bankrate (e.g. "1yr", "14mo")
- Strip product suffixes from bank names (e.g.
"Synchrony Bank CD" → "Synchrony Bank") - Filter out entries that aren't real banks (terms, dollar amounts) - Keep a fallback strategy for future Bankrate layout changes Co-Authored-By: Claude Opus 4.6 --- scripts/fetch-cd-rates.ts | 268 ++++++++++++++++++++------------------ 1 file changed, 141 insertions(+), 127 deletions(-) diff --git a/scripts/fetch-cd-rates.ts b/scripts/fetch-cd-rates.ts index 09e1a42..198b85f 100644 --- a/scripts/fetch-cd-rates.ts +++ b/scripts/fetch-cd-rates.ts @@ -61,12 +61,10 @@ interface MarketRate { */ function parseTermMonths(term: string): number | null { const lower = term.toLowerCase().trim(); - const monthMatch = lower.match(/(\d+)\s*month/); + const monthMatch = lower.match(/(\d+)\s*mo(?:nth)?/); if (monthMatch) return parseInt(monthMatch[1], 10); - const yearMatch = lower.match(/(\d+)\s*year/); - if (yearMatch) return parseInt(yearMatch[1], 10) * 12; - // Handle fractional years like "1.5 years" - const fracYearMatch = lower.match(/([\d.]+)\s*year/); + // Handle fractional years like "1.5 years" or "1.5 yr" + const fracYearMatch = lower.match(/([\d.]+)\s*y(?:ear|r)/); if (fracYearMatch) return Math.round(parseFloat(fracYearMatch[1]) * 12); return null; } @@ -84,10 +82,14 @@ function parseMinDeposit(raw: string): number | null { /** * Parse an APY string like "4.50%", "4.50% APY" into a number. + * Handles edge cases like ".4.50%" (leading period from adjacent text). */ function parseApy(raw: string): number { - const cleaned = raw.replace(/[^0-9.]/g, ''); - return parseFloat(cleaned) || 0; + // Extract the first valid decimal number (digit-leading) from the string + const match = raw.match(/(\d+\.?\d*)/); + if (!match) return 0; + const val = parseFloat(match[1]); + return isNaN(val) ? 0 : val; } /** @@ -98,8 +100,20 @@ function sleep(ms: number): Promise { } /** - * Navigate to a Bankrate URL and scrape rate data. - * Reuses an existing browser instance. 
+ * Navigate to a Bankrate URL and scrape rate data from individual bank offer cards. + * + * Bankrate uses a card-based layout with two sections: + * - .wrt-RateSections-sponsoredoffers (sponsored bank offers) + * - .wrt-RateSections-additionaloffers (additional bank offers) + * + * Each card (.rounded-md) contains: + * - Bank name in img[alt] (the logo) + * - APY after "APY as of" text + * - Min. deposit (CDs) or Min. balance for APY (savings/MM) + * - Term (CDs only): e.g. "1yr", "14mo" + * + * The page also has a summary table (.wealth-product-rate-list) with "best rates" + * per term but NO bank names — we explicitly skip this table. */ async function fetchRatesFromPage( browser: Browser, @@ -109,7 +123,7 @@ async function fetchRatesFromPage( ): Promise { const page: Page = await browser.newPage(); await page.setUserAgent( - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', ); try { @@ -120,13 +134,13 @@ async function fetchRatesFromPage( timeout: 60000, }); - // Wait for rate content to render - console.log('Waiting for rate data to render...'); + // Wait for rate card sections to render + console.log('Waiting for rate cards to render...'); await page.waitForSelector( - 'table, [data-testid*="rate"], .brc-table, [class*="ComparisonTable"], [class*="rate-table"]', + '.wrt-RateSections-sponsoredoffers .rounded-md, .wrt-RateSections-additionaloffers .rounded-md', { timeout: 30000 }, ).catch(() => { - console.log('Primary selectors not found, proceeding with page scan...'); + console.log('Bankrate card selectors not found, will try fallback...'); }); // Extra wait for dynamic content @@ -143,7 +157,7 @@ async function fetchRatesFromPage( }); await sleep(2000); - // Extract rate data from the page + // Extract rate data from individual bank offer cards const rawRates = 
await page.evaluate((maxRates: number) => { const results: Array<{ bank_name: string; @@ -152,120 +166,114 @@ async function fetchRatesFromPage( term_raw: string; }> = []; - // Strategy 1: Look for detailed bank comparison tables - const tables = document.querySelectorAll('table'); - for (const table of tables) { - const rows = table.querySelectorAll('tbody tr'); - if (rows.length < 3) continue; + // Primary strategy: extract from Bankrate offer cards + // Both sponsored and additional offer sections use the same card structure + const cards = [ + ...document.querySelectorAll('.wrt-RateSections-sponsoredoffers > .rounded-md'), + ...document.querySelectorAll('.wrt-RateSections-additionaloffers > .rounded-md'), + ]; - for (const row of rows) { - const cells = row.querySelectorAll('td, th'); - if (cells.length < 3) continue; + for (const card of cards) { + const text = card.textContent || ''; - const texts = Array.from(cells).map((c) => c.textContent?.trim() || ''); - const apyCell = texts.find((t) => /\d+\.\d+\s*%/.test(t)); - if (!apyCell) continue; + // Bank name: from the logo img alt attribute (most reliable) + const img = card.querySelector('img[alt]'); + let bankName = img ? (img as HTMLImageElement).alt.trim() : ''; - const bankCell = texts.find( - (t) => - t.length > 3 && - !/^\d/.test(t) && - !t.includes('%') && - !t.startsWith('$') && - !/^\d+\s*(month|year)/i.test(t), - ); - - const linkEl = row.querySelector('a[href*="review"], a[href*="bank"], img[alt]'); - const linkName = linkEl?.textContent?.trim() || (linkEl as HTMLImageElement)?.alt || ''; - - const name = linkName.length > 3 ? 
linkName : bankCell || ''; - if (!name) continue; - - results.push({ - bank_name: name, - apy_raw: apyCell, - min_deposit_raw: - texts.find((t) => t.includes('$') || /no min/i.test(t)) || '', - term_raw: texts.find((t) => /\d+\s*(month|year)/i.test(t)) || '', - }); - - if (results.length >= maxRates) break; - } - if (results.length >= 5) break; - } - - // Strategy 2: Look for card/list layouts - if (results.length < 5) { - const cardSelectors = [ - '[class*="product"]', - '[class*="offer-card"]', - '[class*="rate-card"]', - '[class*="ComparisonRow"]', - '[class*="comparison-row"]', - '[data-testid*="product"]', - '[class*="partner"]', - ]; - - for (const selector of cardSelectors) { - const cards = document.querySelectorAll(selector); - if (cards.length < 3) continue; - - for (const card of cards) { - const text = card.textContent || ''; - if (text.length < 20 || text.length > 2000) continue; - - const apyMatch = text.match(/([\d.]+)\s*%/); - if (!apyMatch) continue; - - const nameEl = - card.querySelector( - 'h2, h3, h4, h5, strong, [class*="name"], [class*="bank"], [class*="title"], a[href*="review"], img[alt]', - ); - let bankName = nameEl?.textContent?.trim() || (nameEl as HTMLImageElement)?.alt || ''; - - if (!bankName || bankName.length < 3 || /^\d/.test(bankName) || bankName.includes('%')) continue; - - const depositMatch = text.match(/\$[\d,]+/); - const termMatch = text.match(/\d+\s*(?:month|year)s?/i); - - results.push({ - bank_name: bankName, - apy_raw: apyMatch[0], - min_deposit_raw: depositMatch?.[0] || '', - term_raw: termMatch?.[0] || '', - }); - - if (results.length >= maxRates) break; + // Fallback: extract from text before "Add to compare" + if (!bankName) { + const addIdx = text.indexOf('Add to compare'); + if (addIdx > 0) { + bankName = text.substring(0, addIdx) + .replace(/Editor's pick/gi, '') + .trim(); } - if (results.length >= 5) break; } + + // Fallback: extract from product name pattern (e.g. 
"NexBank CD") + if (!bankName) { + const productMatch = text.match(/^(?:Editor's pick)?\s*([A-Z][\w\sĀ®*.'&-]+?(?:CD|Account|Savings|Money Market))/); + if (productMatch) bankName = productMatch[1].trim(); + } + + if (!bankName || bankName.length < 2) continue; + + // APY: find the percentage that appears after "APY as of" context. + // Avoid picking up the Bankrate score (e.g. "4.5 Bankrate CD score"). + // Use \b or (?= maxRates) break; } - // Strategy 3: Broad scan for rate-bearing elements - if (results.length < 5) { - const allElements = document.querySelectorAll( - 'div, section, article, li', + // Fallback strategy: if card-based extraction found nothing, + // scan for any elements with bank-like names and APY percentages. + // This guards against future Bankrate layout changes. + if (results.length === 0) { + const fallbackCards = document.querySelectorAll( + '[class*="product"], [class*="offer"], [class*="rate-card"], [class*="ComparisonRow"]', ); - for (const el of allElements) { - if (el.children.length > 20) continue; - const text = el.textContent || ''; - if (text.length < 20 || text.length > 500) continue; + for (const card of fallbackCards) { + const text = card.textContent || ''; + if (text.length < 20 || text.length > 2000) continue; - const apyMatch = text.match(/([\d.]+)\s*%\s*(?:APY)?/i); + const apyMatch = text.match(/(\d+\.?\d*)\s*%\s*(?:APY)?/); if (!apyMatch) continue; - const bankEl = el.querySelector( - 'h2, h3, h4, h5, strong, b, a[href*="review"]', - ); - let bankName = bankEl?.textContent?.trim() || ''; - if (!bankName || bankName.length < 3 || /^\d/.test(bankName)) continue; + const nameEl = card.querySelector('img[alt], h2, h3, h4, h5, [class*="name"], [class*="bank"]'); + const bankName = (nameEl as HTMLImageElement)?.alt + || nameEl?.textContent?.trim() + || ''; + if (!bankName || bankName.length < 2 || /^\d/.test(bankName) || bankName.includes('%')) continue; const depositMatch = text.match(/\$[\d,]+/); - const termMatch = 
text.match(/\d+\s*(?:month|year)s?/i); + const termMatch = text.match(/(\d+)\s*(?:month|year)s?/i); results.push({ bank_name: bankName, - apy_raw: apyMatch[0], + apy_raw: apyMatch[1] + '%', min_deposit_raw: depositMatch?.[0] || '', term_raw: termMatch?.[0] || '', }); @@ -284,20 +292,26 @@ async function fetchRatesFromPage( const parsed: MarketRate[] = rawRates .map((r) => { - let bankName = r.bank_name.replace(/\s+/g, ' ').trim(); + let bankName = r.bank_name + .replace(/\s+/g, ' ') + .replace(/Editor's pick/gi, '') + .trim(); + + // Strip trailing product suffixes to normalize bank name + // e.g. "Marcus by Goldman Sachs CD" → "Marcus by Goldman Sachs" + bankName = bankName + .replace(/\s+(CD|Certificate of Deposit|Money Market|Savings|High[- ]Yield Savings)\s*$/i, '') + .trim(); + const term = isTermProduct ? (r.term_raw || 'N/A') : 'N/A'; - // For CDs: if bank name looks like a term, label it descriptively - if (isTermProduct) { - const termText = r.term_raw || bankName; - if ( - /^\d+\s*(month|year)/i.test(bankName) || - /no\s*min/i.test(bankName) || - /^\$/.test(bankName) || - bankName.length < 4 - ) { - bankName = `Top CD Rate - ${termText.replace(/^\d+/, (m: string) => m + ' ')}`.replace(/\s+/g, ' ').trim(); - } + // Skip entries where bank_name still looks like a term or number (not a real bank) + if ( + /^\d+\s*(month|year)/i.test(bankName) || + /^\$/.test(bankName) || + bankName.length < 2 + ) { + return null; } return { @@ -305,11 +319,11 @@ async function fetchRatesFromPage( apy: parseApy(r.apy_raw), min_deposit: parseMinDeposit(r.min_deposit_raw), term, - term_months: isTermProduct ? parseTermMonths(r.term_raw || bankName) : null, + term_months: isTermProduct ? parseTermMonths(r.term_raw) : null, rate_type: rateType, }; }) - .filter((r) => r.bank_name && r.apy > 0); + .filter((r): r is MarketRate => r !== null && r.bank_name.length > 0 && r.apy > 0 && r.apy <= 20); // Deduplicate by bank name + term (keep highest APY) const seen = new Map();