fix: rewrite Bankrate scraper to extract actual bank names from offer cards
The previous scraper was picking up Bankrate's summary table (.wealth-product-rate-list) which only has "best rates" per term with no bank names, resulting in entries like "Top CD Rate - 1 year".

Now targets the actual bank offer cards in .wrt-RateSections-sponsoredoffers and .wrt-RateSections-additionaloffers sections.

Key changes:
- Extract bank names from img[alt] (logo) with text-based fallbacks
- Fix APY parsing to avoid Bankrate score leaking in (e.g. "4.5" score concatenated with "4.00%" APY was parsed as 0.4%)
- Handle both "Min. deposit" (CDs) and "Min. balance for APY" (savings/MM)
- Parse abbreviated terms from Bankrate (e.g. "1yr", "14mo")
- Strip product suffixes from bank names (e.g. "Synchrony Bank CD" → "Synchrony Bank")
- Filter out entries that aren't real banks (terms, dollar amounts)
- Keep a fallback strategy for future Bankrate layout changes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -61,12 +61,10 @@ interface MarketRate {
|
||||
*/
|
||||
function parseTermMonths(term: string): number | null {
  // Normalise once so the unit checks below are case-insensitive.
  const lower = term.toLowerCase().trim();

  // Months take priority over years, matching the historical lookup order.
  // Accepts "6 months", "6 month", "14mo". The number may carry a fraction
  // and is rounded to whole months.
  const monthValue = lower.match(/(\d+(?:\.\d+)?)\s*mo(?:nth)?/)?.[1];
  if (monthValue !== undefined) {
    return Math.round(parseFloat(monthValue));
  }

  // Years: "1 year", "5 years", "1yr", "1.5 years", "1.5 yr".
  // The capture group includes the optional fractional part. A bare (\d+)
  // would latch onto the digits after the decimal point, so "1.5 years"
  // would be read as "5 years" and yield 60 months instead of 18.
  const yearValue = lower.match(/(\d+(?:\.\d+)?)\s*y(?:ear|r)/)?.[1];
  if (yearValue !== undefined) {
    return Math.round(parseFloat(yearValue) * 12);
  }

  // No recognisable "<number><unit>" pair in the input.
  return null;
}
|
||||
@@ -84,10 +82,14 @@ function parseMinDeposit(raw: string): number | null {
|
||||
|
||||
/**
 * Parse an APY string like "4.50%" or "4.50% APY" into a number.
 *
 * Only the first digit-leading decimal number in the string is used, so a
 * stray leading period picked up from adjacent text (e.g. ".4.50%") is
 * ignored and the value is read as 4.5 rather than 0.4.
 *
 * @param raw - Raw APY text.
 * @returns The parsed percentage value, or 0 when the text contains no
 *          number or the number is not finite.
 */
function parseApy(raw: string): number {
  // Digit-leading match: never starts on a ".", so ".4.50%" yields "4.50".
  const numeric = raw.match(/\d+(?:\.\d+)?/)?.[0];
  if (numeric === undefined) return 0;

  const value = parseFloat(numeric);
  // A digits-only match cannot produce NaN, but an absurdly long digit run
  // overflows to Infinity; treat that as "no usable rate".
  return Number.isFinite(value) ? value : 0;
}
|
||||
|
||||
/**
|
||||
@@ -98,8 +100,20 @@ function sleep(ms: number): Promise<void> {
|
||||
}
|
||||
|
||||
/**
|
||||
* Navigate to a Bankrate URL and scrape rate data.
|
||||
* Reuses an existing browser instance.
|
||||
* Navigate to a Bankrate URL and scrape rate data from individual bank offer cards.
|
||||
*
|
||||
* Bankrate uses a card-based layout with two sections:
|
||||
* - .wrt-RateSections-sponsoredoffers (sponsored bank offers)
|
||||
* - .wrt-RateSections-additionaloffers (additional bank offers)
|
||||
*
|
||||
* Each card (.rounded-md) contains:
|
||||
* - Bank name in img[alt] (the logo)
|
||||
* - APY after "APY as of" text
|
||||
* - Min. deposit (CDs) or Min. balance for APY (savings/MM)
|
||||
* - Term (CDs only): e.g. "1yr", "14mo"
|
||||
*
|
||||
* The page also has a summary table (.wealth-product-rate-list) with "best rates"
|
||||
* per term but NO bank names — we explicitly skip this table.
|
||||
*/
|
||||
async function fetchRatesFromPage(
|
||||
browser: Browser,
|
||||
@@ -109,7 +123,7 @@ async function fetchRatesFromPage(
|
||||
): Promise<MarketRate[]> {
|
||||
const page: Page = await browser.newPage();
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
||||
);
|
||||
|
||||
try {
|
||||
@@ -120,13 +134,13 @@ async function fetchRatesFromPage(
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
// Wait for rate content to render
|
||||
console.log('Waiting for rate data to render...');
|
||||
// Wait for rate card sections to render
|
||||
console.log('Waiting for rate cards to render...');
|
||||
await page.waitForSelector(
|
||||
'table, [data-testid*="rate"], .brc-table, [class*="ComparisonTable"], [class*="rate-table"]',
|
||||
'.wrt-RateSections-sponsoredoffers .rounded-md, .wrt-RateSections-additionaloffers .rounded-md',
|
||||
{ timeout: 30000 },
|
||||
).catch(() => {
|
||||
console.log('Primary selectors not found, proceeding with page scan...');
|
||||
console.log('Bankrate card selectors not found, will try fallback...');
|
||||
});
|
||||
|
||||
// Extra wait for dynamic content
|
||||
@@ -143,7 +157,7 @@ async function fetchRatesFromPage(
|
||||
});
|
||||
await sleep(2000);
|
||||
|
||||
// Extract rate data from the page
|
||||
// Extract rate data from individual bank offer cards
|
||||
const rawRates = await page.evaluate((maxRates: number) => {
|
||||
const results: Array<{
|
||||
bank_name: string;
|
||||
@@ -152,120 +166,114 @@ async function fetchRatesFromPage(
|
||||
term_raw: string;
|
||||
}> = [];
|
||||
|
||||
// Strategy 1: Look for detailed bank comparison tables
|
||||
const tables = document.querySelectorAll('table');
|
||||
for (const table of tables) {
|
||||
const rows = table.querySelectorAll('tbody tr');
|
||||
if (rows.length < 3) continue;
|
||||
|
||||
for (const row of rows) {
|
||||
const cells = row.querySelectorAll('td, th');
|
||||
if (cells.length < 3) continue;
|
||||
|
||||
const texts = Array.from(cells).map((c) => c.textContent?.trim() || '');
|
||||
const apyCell = texts.find((t) => /\d+\.\d+\s*%/.test(t));
|
||||
if (!apyCell) continue;
|
||||
|
||||
const bankCell = texts.find(
|
||||
(t) =>
|
||||
t.length > 3 &&
|
||||
!/^\d/.test(t) &&
|
||||
!t.includes('%') &&
|
||||
!t.startsWith('$') &&
|
||||
!/^\d+\s*(month|year)/i.test(t),
|
||||
);
|
||||
|
||||
const linkEl = row.querySelector('a[href*="review"], a[href*="bank"], img[alt]');
|
||||
const linkName = linkEl?.textContent?.trim() || (linkEl as HTMLImageElement)?.alt || '';
|
||||
|
||||
const name = linkName.length > 3 ? linkName : bankCell || '';
|
||||
if (!name) continue;
|
||||
|
||||
results.push({
|
||||
bank_name: name,
|
||||
apy_raw: apyCell,
|
||||
min_deposit_raw:
|
||||
texts.find((t) => t.includes('$') || /no min/i.test(t)) || '',
|
||||
term_raw: texts.find((t) => /\d+\s*(month|year)/i.test(t)) || '',
|
||||
});
|
||||
|
||||
if (results.length >= maxRates) break;
|
||||
}
|
||||
if (results.length >= 5) break;
|
||||
}
|
||||
|
||||
// Strategy 2: Look for card/list layouts
|
||||
if (results.length < 5) {
|
||||
const cardSelectors = [
|
||||
'[class*="product"]',
|
||||
'[class*="offer-card"]',
|
||||
'[class*="rate-card"]',
|
||||
'[class*="ComparisonRow"]',
|
||||
'[class*="comparison-row"]',
|
||||
'[data-testid*="product"]',
|
||||
'[class*="partner"]',
|
||||
// Primary strategy: extract from Bankrate offer cards
|
||||
// Both sponsored and additional offer sections use the same card structure
|
||||
const cards = [
|
||||
...document.querySelectorAll('.wrt-RateSections-sponsoredoffers > .rounded-md'),
|
||||
...document.querySelectorAll('.wrt-RateSections-additionaloffers > .rounded-md'),
|
||||
];
|
||||
|
||||
for (const selector of cardSelectors) {
|
||||
const cards = document.querySelectorAll(selector);
|
||||
if (cards.length < 3) continue;
|
||||
|
||||
for (const card of cards) {
|
||||
const text = card.textContent || '';
|
||||
if (text.length < 20 || text.length > 2000) continue;
|
||||
|
||||
const apyMatch = text.match(/([\d.]+)\s*%/);
|
||||
if (!apyMatch) continue;
|
||||
// Bank name: from the logo img alt attribute (most reliable)
|
||||
const img = card.querySelector('img[alt]');
|
||||
let bankName = img ? (img as HTMLImageElement).alt.trim() : '';
|
||||
|
||||
const nameEl =
|
||||
card.querySelector(
|
||||
'h2, h3, h4, h5, strong, [class*="name"], [class*="bank"], [class*="title"], a[href*="review"], img[alt]',
|
||||
);
|
||||
let bankName = nameEl?.textContent?.trim() || (nameEl as HTMLImageElement)?.alt || '';
|
||||
// Fallback: extract from text before "Add to compare"
|
||||
if (!bankName) {
|
||||
const addIdx = text.indexOf('Add to compare');
|
||||
if (addIdx > 0) {
|
||||
bankName = text.substring(0, addIdx)
|
||||
.replace(/Editor's pick/gi, '')
|
||||
.trim();
|
||||
}
|
||||
}
|
||||
|
||||
if (!bankName || bankName.length < 3 || /^\d/.test(bankName) || bankName.includes('%')) continue;
|
||||
// Fallback: extract from product name pattern (e.g. "NexBank CD")
|
||||
if (!bankName) {
|
||||
const productMatch = text.match(/^(?:Editor's pick)?\s*([A-Z][\w\s®*.'&-]+?(?:CD|Account|Savings|Money Market))/);
|
||||
if (productMatch) bankName = productMatch[1].trim();
|
||||
}
|
||||
|
||||
const depositMatch = text.match(/\$[\d,]+/);
|
||||
const termMatch = text.match(/\d+\s*(?:month|year)s?/i);
|
||||
if (!bankName || bankName.length < 2) continue;
|
||||
|
||||
// APY: find the percentage that appears after "APY as of" context.
|
||||
// Avoid picking up the Bankrate score (e.g. "4.5 Bankrate CD score").
|
||||
// Use \b or (?<!\d) to avoid capturing leading periods from adjacent text.
|
||||
let apyRaw = '';
|
||||
const apySection = text.match(/APY as of[\s\S]*?(\d+\.?\d*)\s*%/);
|
||||
if (apySection) {
|
||||
apyRaw = apySection[1] + '%';
|
||||
} else {
|
||||
// Broader fallback: find "X.XX% APY" or just "X.XX%"
|
||||
const apyMatch = text.match(/(\d+\.?\d*)\s*%\s*(?:APY)?/);
|
||||
if (apyMatch) apyRaw = apyMatch[1] + '%';
|
||||
}
|
||||
if (!apyRaw) continue;
|
||||
|
||||
// Min. deposit: CDs use "Min. deposit $X", savings/MM use "Min. balance for APY$X"
|
||||
let minDepositRaw = '';
|
||||
const minDepMatch = text.match(/Min\.\s*deposit\s*\$\s*([\d,]+)/i);
|
||||
const minBalMatch = text.match(/Min\.\s*balance\s*for\s*APY\s*\$\s*([\d,.]+)/i);
|
||||
const noMin = /No minimum/i.test(text);
|
||||
if (noMin) {
|
||||
minDepositRaw = '$0';
|
||||
} else if (minDepMatch) {
|
||||
minDepositRaw = '$' + minDepMatch[1];
|
||||
} else if (minBalMatch) {
|
||||
minDepositRaw = '$' + minBalMatch[1];
|
||||
}
|
||||
|
||||
// Term: CDs have terms like "1yr", "14mo", "1.5yr"
|
||||
let termRaw = '';
|
||||
const termMatch = text.match(/Term\s*([\d.]+)\s*(yr|mo|year|month)s?/i);
|
||||
if (termMatch) {
|
||||
const num = termMatch[1];
|
||||
const unit = termMatch[2].toLowerCase();
|
||||
if (unit === 'yr' || unit === 'year') {
|
||||
termRaw = `${num} year${num === '1' ? '' : 's'}`;
|
||||
} else {
|
||||
termRaw = `${num} month${num === '1' ? '' : 's'}`;
|
||||
}
|
||||
}
|
||||
|
||||
results.push({
|
||||
bank_name: bankName,
|
||||
apy_raw: apyMatch[0],
|
||||
min_deposit_raw: depositMatch?.[0] || '',
|
||||
term_raw: termMatch?.[0] || '',
|
||||
apy_raw: apyRaw,
|
||||
min_deposit_raw: minDepositRaw,
|
||||
term_raw: termRaw,
|
||||
});
|
||||
|
||||
if (results.length >= maxRates) break;
|
||||
}
|
||||
if (results.length >= 5) break;
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 3: Broad scan for rate-bearing elements
|
||||
if (results.length < 5) {
|
||||
const allElements = document.querySelectorAll(
|
||||
'div, section, article, li',
|
||||
// Fallback strategy: if card-based extraction found nothing,
|
||||
// scan for any elements with bank-like names and APY percentages.
|
||||
// This guards against future Bankrate layout changes.
|
||||
if (results.length === 0) {
|
||||
const fallbackCards = document.querySelectorAll(
|
||||
'[class*="product"], [class*="offer"], [class*="rate-card"], [class*="ComparisonRow"]',
|
||||
);
|
||||
for (const el of allElements) {
|
||||
if (el.children.length > 20) continue;
|
||||
const text = el.textContent || '';
|
||||
if (text.length < 20 || text.length > 500) continue;
|
||||
for (const card of fallbackCards) {
|
||||
const text = card.textContent || '';
|
||||
if (text.length < 20 || text.length > 2000) continue;
|
||||
|
||||
const apyMatch = text.match(/([\d.]+)\s*%\s*(?:APY)?/i);
|
||||
const apyMatch = text.match(/(\d+\.?\d*)\s*%\s*(?:APY)?/);
|
||||
if (!apyMatch) continue;
|
||||
|
||||
const bankEl = el.querySelector(
|
||||
'h2, h3, h4, h5, strong, b, a[href*="review"]',
|
||||
);
|
||||
let bankName = bankEl?.textContent?.trim() || '';
|
||||
if (!bankName || bankName.length < 3 || /^\d/.test(bankName)) continue;
|
||||
const nameEl = card.querySelector('img[alt], h2, h3, h4, h5, [class*="name"], [class*="bank"]');
|
||||
const bankName = (nameEl as HTMLImageElement)?.alt
|
||||
|| nameEl?.textContent?.trim()
|
||||
|| '';
|
||||
if (!bankName || bankName.length < 2 || /^\d/.test(bankName) || bankName.includes('%')) continue;
|
||||
|
||||
const depositMatch = text.match(/\$[\d,]+/);
|
||||
const termMatch = text.match(/\d+\s*(?:month|year)s?/i);
|
||||
const termMatch = text.match(/(\d+)\s*(?:month|year)s?/i);
|
||||
|
||||
results.push({
|
||||
bank_name: bankName,
|
||||
apy_raw: apyMatch[0],
|
||||
apy_raw: apyMatch[1] + '%',
|
||||
min_deposit_raw: depositMatch?.[0] || '',
|
||||
term_raw: termMatch?.[0] || '',
|
||||
});
|
||||
@@ -284,20 +292,26 @@ async function fetchRatesFromPage(
|
||||
|
||||
const parsed: MarketRate[] = rawRates
|
||||
.map((r) => {
|
||||
let bankName = r.bank_name.replace(/\s+/g, ' ').trim();
|
||||
let bankName = r.bank_name
|
||||
.replace(/\s+/g, ' ')
|
||||
.replace(/Editor's pick/gi, '')
|
||||
.trim();
|
||||
|
||||
// Strip trailing product suffixes to normalize bank name
|
||||
// e.g. "Marcus by Goldman Sachs CD" → "Marcus by Goldman Sachs"
|
||||
bankName = bankName
|
||||
.replace(/\s+(CD|Certificate of Deposit|Money Market|Savings|High[- ]Yield Savings)\s*$/i, '')
|
||||
.trim();
|
||||
|
||||
const term = isTermProduct ? (r.term_raw || 'N/A') : 'N/A';
|
||||
|
||||
// For CDs: if bank name looks like a term, label it descriptively
|
||||
if (isTermProduct) {
|
||||
const termText = r.term_raw || bankName;
|
||||
// Skip entries where bank_name still looks like a term or number (not a real bank)
|
||||
if (
|
||||
/^\d+\s*(month|year)/i.test(bankName) ||
|
||||
/no\s*min/i.test(bankName) ||
|
||||
/^\$/.test(bankName) ||
|
||||
bankName.length < 4
|
||||
bankName.length < 2
|
||||
) {
|
||||
bankName = `Top CD Rate - ${termText.replace(/^\d+/, (m: string) => m + ' ')}`.replace(/\s+/g, ' ').trim();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
return {
|
||||
@@ -305,11 +319,11 @@ async function fetchRatesFromPage(
|
||||
apy: parseApy(r.apy_raw),
|
||||
min_deposit: parseMinDeposit(r.min_deposit_raw),
|
||||
term,
|
||||
term_months: isTermProduct ? parseTermMonths(r.term_raw || bankName) : null,
|
||||
term_months: isTermProduct ? parseTermMonths(r.term_raw) : null,
|
||||
rate_type: rateType,
|
||||
};
|
||||
})
|
||||
.filter((r) => r.bank_name && r.apy > 0);
|
||||
.filter((r): r is MarketRate => r !== null && r.bank_name.length > 0 && r.apy > 0 && r.apy <= 20);
|
||||
|
||||
// Deduplicate by bank name + term (keep highest APY)
|
||||
const seen = new Map<string, MarketRate>();
|
||||
|
||||
Reference in New Issue
Block a user