The previous scraper was picking up Bankrate's summary table (.wealth-product-rate-list) which only has "best rates" per term with no bank names, resulting in entries like "Top CD Rate - 1 year". Now targets the actual bank offer cards in .wrt-RateSections-sponsoredoffers and .wrt-RateSections-additionaloffers sections. Key changes: - Extract bank names from img[alt] (logo) with text-based fallbacks - Fix APY parsing to avoid Bankrate score leaking in (e.g. "4.5" score concatenated with "4.00%" APY was parsed as 0.4%) - Handle both "Min. deposit" (CDs) and "Min. balance for APY" (savings/MM) - Parse abbreviated terms from Bankrate (e.g. "1yr", "14mo") - Strip product suffixes from bank names (e.g. "Synchrony Bank CD" → "Synchrony Bank") - Filter out entries that aren't real banks (terms, dollar amounts) - Keep a fallback strategy for future Bankrate layout changes Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
473 lines
15 KiB
TypeScript
473 lines
15 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
|
|
* Market Rate Fetcher Script
|
|
*
|
|
* Scrapes the top CD, Money Market, and High Yield Savings rates from
|
|
* Bankrate.com and stores them in the shared.cd_rates table in PostgreSQL.
|
|
* Designed to run standalone via cron (once per day).
|
|
*
|
|
* Historical data is preserved — each fetch adds new rows with the current
|
|
* timestamp. The application queries only the latest batch per rate type.
|
|
*
|
|
* Usage:
|
|
* cd scripts
|
|
* npm install
|
|
* npx tsx fetch-cd-rates.ts
|
|
*
|
|
* Environment:
|
|
* DATABASE_URL - PostgreSQL connection string (reads from ../.env)
|
|
*/
|
|
|
|
import * as dotenv from 'dotenv';
|
|
import { resolve } from 'path';
|
|
import { Pool } from 'pg';
|
|
import puppeteer, { type Browser, type Page } from 'puppeteer';
|
|
|
|
// Load .env from project root
|
|
// The path is anchored on __dirname, so it resolves to the parent of this
// script's directory regardless of the current working directory.
dotenv.config({ path: resolve(__dirname, '..', '.env') });
|
|
|
|
// Cap on entries: limits how many offers are collected per page and how many
// rates are returned per rate type.
const MAX_RATES = 25;
|
|
|
|
// Rate source configurations: one Bankrate listing page per product type.
//   type  - value stored as rate_type on every scraped row
//   label - human-readable name used in log output
//   url   - page that is scraped
const RATE_SOURCES = [
  { type: 'cd', label: 'CD Rates', url: 'https://www.bankrate.com/banking/cds/cd-rates/' },
  {
    type: 'high_yield_savings',
    label: 'High Yield Savings',
    url: 'https://www.bankrate.com/banking/savings/best-high-yield-interests-savings-accounts/',
  },
  { type: 'money_market', label: 'Money Market', url: 'https://www.bankrate.com/banking/money-market/rates/' },
];
|
|
|
|
/**
 * One normalized rate offer, shaped like a row of shared.cd_rates
 * (minus the fetched_at / source_url columns added at insert time).
 */
interface MarketRate {
  /** Bank name after whitespace cleanup and removal of product suffixes. */
  bank_name: string;

  /** Annual percentage yield as a plain number, e.g. 4.5 for "4.50%". */
  apy: number;

  /** Minimum deposit / balance in dollars, or null when none could be parsed. */
  min_deposit: number | null;

  /** Display term such as "1 year" or "14 months"; 'N/A' when no term applies. */
  term: string;

  /** Term length in months, or null when there is no parseable term. */
  term_months: number | null;

  /** Product category of the source page, e.g. 'cd' or 'money_market'. */
  rate_type: string;
}
|
|
|
|
/**
 * Parse a term string like "3 months", "14mo", "1 year" or "1.5 yr" into a
 * whole number of months.
 *
 * Month units are checked first, then year units (converted at 12 months per
 * year). Fractional values are rounded to the nearest month.
 *
 * @param term - Free-form term text.
 * @returns The term in months, or null when no numeric term is present
 *          (never NaN).
 */
function parseTermMonths(term: string): number | null {
  const lower = term.toLowerCase().trim();

  // The number must contain at least one digit: "12", "1.5", "1." or ".5".
  // A lone "." is rejected so it can never turn into NaN.
  const months = lower.match(/(\d+(?:\.\d*)?|\.\d+)\s*mo/)?.[1];
  if (months !== undefined) return Math.round(parseFloat(months));

  // "year" and the abbreviated "yr" are both accepted.
  const years = lower.match(/(\d+(?:\.\d*)?|\.\d+)\s*y(?:ear|r)/)?.[1];
  if (years !== undefined) return Math.round(parseFloat(years) * 12);

  return null;
}
|
|
|
|
/**
 * Parse a currency string like "$500", "$1,000" or "$0" into a dollar amount.
 *
 * Only the first numeric token is used, so surrounding label text
 * ("Min. deposit $1,000") or a second amount ("$2,500 - $10,000") cannot
 * corrupt the value. Thousands separators are removed before conversion.
 *
 * @param raw - Raw deposit text; may be empty.
 * @returns The amount as a number, or null when the text is empty or contains
 *          no digits (e.g. "No minimum").
 */
function parseMinDeposit(raw: string): number | null {
  if (!raw) return null;

  // Digit-led token: digits with optional commas, then an optional decimal part.
  const amount = raw.match(/\d[\d,]*(?:\.\d+)?/)?.[0];
  if (amount === undefined) return null;

  return parseFloat(amount.replace(/,/g, ''));
}
|
|
|
|
/**
 * Parse an APY string like "4.50%" or "4.50% APY" into a number.
 *
 * The number directly attached to a percent sign is preferred, so unrelated
 * leading numbers (e.g. a score printed next to the rate) are not mistaken for
 * the APY. When no percent sign is present, the first digit-led number is
 * used. A stray leading period (".4.50%") is ignored in both cases.
 *
 * @param raw - Raw APY text.
 * @returns The APY as a number, or 0 when the text contains no number.
 */
function parseApy(raw: string): number {
  const percentMatch = raw.match(/(\d+(?:\.\d+)?)\s*%/);
  // Fallback: first valid decimal number (digit-leading) anywhere in the string.
  const digits = (percentMatch ?? raw.match(/(\d+\.?\d*)/))?.[1];
  return digits === undefined ? 0 : parseFloat(digits);
}
|
|
|
|
/**
 * Return a promise that settles after roughly `ms` milliseconds.
 * Used with `await` to pause between scraping steps.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
|
|
/**
 * One unparsed offer as extracted in the browser context. All fields are raw
 * strings; a field is '' when the card did not show that value.
 */
interface RawRate {
  bank_name: string;
  apy_raw: string;
  min_deposit_raw: string;
  term_raw: string;
}

/**
 * Turn raw scraped strings into MarketRate rows: clean up bank names, parse
 * the numeric fields, drop implausible entries, keep the highest APY per
 * bank + term, and return at most MAX_RATES rows sorted by APY (descending).
 */
function normalizeRawRates(rawRates: readonly RawRate[], rateType: string): MarketRate[] {
  // Only CDs carry a term; every other rate type is stored with term 'N/A'.
  const isTermProduct = rateType === 'cd';
  const bestByKey = new Map<string, MarketRate>();

  for (const raw of rawRates) {
    const bankName = raw.bank_name
      .replace(/\s+/g, ' ')
      .replace(/Editor's pick/gi, '')
      .trim()
      // Strip trailing product suffixes to normalize the bank name,
      // e.g. "Marcus by Goldman Sachs CD" → "Marcus by Goldman Sachs"
      .replace(/\s+(CD|Certificate of Deposit|Money Market|Savings|High[- ]Yield Savings)\s*$/i, '')
      .trim();

    // Skip entries whose name still looks like a term or a dollar amount.
    const looksLikeBank =
      bankName.length >= 2 &&
      !/^\d+\s*(month|year)/i.test(bankName) &&
      !/^\$/.test(bankName);
    if (!looksLikeBank) continue;

    // Sanity bound: anything outside (0, 20] is treated as a mis-parse.
    const apy = parseApy(raw.apy_raw);
    if (!(apy > 0 && apy <= 20)) continue;

    const rate: MarketRate = {
      bank_name: bankName,
      apy,
      min_deposit: parseMinDeposit(raw.min_deposit_raw),
      term: isTermProduct ? raw.term_raw || 'N/A' : 'N/A',
      term_months: isTermProduct ? parseTermMonths(raw.term_raw) : null,
      rate_type: rateType,
    };

    // Deduplicate by bank name + term, keeping the highest APY.
    const key = `${rate.bank_name}|${rate.term}`;
    const existing = bestByKey.get(key);
    if (!existing || rate.apy > existing.apy) {
      bestByKey.set(key, rate);
    }
  }

  return Array.from(bestByKey.values())
    .sort((a, b) => b.apy - a.apy)
    .slice(0, MAX_RATES);
}

/**
 * Navigate to a Bankrate URL and scrape rate data from individual bank offer cards.
 *
 * Bankrate uses a card-based layout with two sections:
 *   - .wrt-RateSections-sponsoredoffers (sponsored bank offers)
 *   - .wrt-RateSections-additionaloffers (additional bank offers)
 *
 * Each card (.rounded-md) contains:
 *   - Bank name in img[alt] (the logo)
 *   - APY after "APY as of" text
 *   - Min. deposit (CDs) or Min. balance for APY (savings/MM)
 *   - Term (CDs only): e.g. "1yr", "14mo"
 *
 * The page also has a summary table (.wealth-product-rate-list) with "best rates"
 * per term but NO bank names — it is never queried here.
 *
 * If the card selectors yield nothing, a looser fallback scan over
 * product/offer-like elements is used instead.
 *
 * @param browser   - Running Puppeteer browser; a new page is opened on it and
 *                    always closed again, even when scraping fails.
 * @param sourceUrl - Page to scrape.
 * @param rateType  - Value stored as rate_type; 'cd' enables term parsing.
 * @param label     - Human-readable name used in log output.
 * @returns Up to MAX_RATES normalized, deduplicated rates sorted by APY
 *          (descending). Empty when nothing could be extracted.
 * @throws Propagates navigation and evaluation errors from Puppeteer. A
 *         timeout while waiting for the card selectors is logged and ignored.
 */
async function fetchRatesFromPage(
  browser: Browser,
  sourceUrl: string,
  rateType: string,
  label: string,
): Promise<MarketRate[]> {
  const page: Page = await browser.newPage();

  try {
    // Inside the try block so the page is closed even if this call rejects.
    await page.setUserAgent(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    );

    console.log(`\n--- Fetching ${label} ---`);
    console.log(`Navigating to ${sourceUrl}...`);
    await page.goto(sourceUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for rate card sections to render. Failure is not fatal: the
    // extraction below has its own fallback strategy.
    console.log('Waiting for rate cards to render...');
    try {
      await page.waitForSelector(
        '.wrt-RateSections-sponsoredoffers .rounded-md, .wrt-RateSections-additionaloffers .rounded-md',
        { timeout: 30000 },
      );
    } catch {
      console.log('Bankrate card selectors not found, will try fallback...');
    }

    // Extra wait for dynamic content
    await sleep(3000);

    // Scroll down in steps to load all content, then return to the top.
    console.log('Scrolling to load all content...');
    await page.evaluate(async () => {
      for (let i = 0; i < 10; i++) {
        window.scrollBy(0, 800);
        await new Promise((r) => setTimeout(r, 500));
      }
      window.scrollTo(0, 0);
    });
    await sleep(2000);

    // Extract rate data from individual bank offer cards. This callback runs
    // in the page, so it cannot use anything from this module; only the
    // serializable maxRates argument is passed in.
    const rawRates = await page.evaluate((maxRates: number) => {
      const results: RawRate[] = [];

      // Primary strategy: extract from Bankrate offer cards.
      // Both sponsored and additional offer sections use the same card structure.
      const cards = [
        ...document.querySelectorAll('.wrt-RateSections-sponsoredoffers > .rounded-md'),
        ...document.querySelectorAll('.wrt-RateSections-additionaloffers > .rounded-md'),
      ];

      for (const card of cards) {
        const text = card.textContent || '';

        // Bank name: from the logo img alt attribute (most reliable)
        const img = card.querySelector('img[alt]');
        let bankName = img ? (img as HTMLImageElement).alt.trim() : '';

        // Fallback: extract from text before "Add to compare"
        if (!bankName) {
          const addIdx = text.indexOf('Add to compare');
          if (addIdx > 0) {
            bankName = text
              .substring(0, addIdx)
              .replace(/Editor's pick/gi, '')
              .trim();
          }
        }

        // Fallback: extract from product name pattern (e.g. "NexBank CD")
        if (!bankName) {
          const productMatch = text.match(/^(?:Editor's pick)?\s*([A-Z][\w\s®*.'&-]+?(?:CD|Account|Savings|Money Market))/);
          if (productMatch) bankName = productMatch[1].trim();
        }

        if (!bankName || bankName.length < 2) continue;

        // APY: take the first percentage that appears after "APY as of", which
        // avoids picking up the Bankrate score (e.g. "4.5 Bankrate CD score").
        // The capture group is digit-led, so a leading period from adjacent
        // text is never included.
        let apyRaw = '';
        const apySection = text.match(/APY as of[\s\S]*?(\d+\.?\d*)\s*%/);
        if (apySection) {
          apyRaw = apySection[1] + '%';
        } else {
          // Broader fallback: find "X.XX% APY" or just "X.XX%"
          const apyMatch = text.match(/(\d+\.?\d*)\s*%\s*(?:APY)?/);
          if (apyMatch) apyRaw = apyMatch[1] + '%';
        }
        if (!apyRaw) continue;

        // Min. deposit: CDs use "Min. deposit $X", savings/MM use "Min. balance for APY$X".
        // "No minimum" anywhere in the card takes precedence and maps to $0.
        let minDepositRaw = '';
        const minDepMatch = text.match(/Min\.\s*deposit\s*\$\s*([\d,]+)/i);
        const minBalMatch = text.match(/Min\.\s*balance\s*for\s*APY\s*\$\s*([\d,.]+)/i);
        const noMin = /No minimum/i.test(text);
        if (noMin) {
          minDepositRaw = '$0';
        } else if (minDepMatch) {
          minDepositRaw = '$' + minDepMatch[1];
        } else if (minBalMatch) {
          minDepositRaw = '$' + minBalMatch[1];
        }

        // Term: CDs have terms like "1yr", "14mo", "1.5yr"; expand them to
        // "1 year" / "14 months" form.
        let termRaw = '';
        const termMatch = text.match(/Term\s*([\d.]+)\s*(yr|mo|year|month)s?/i);
        if (termMatch) {
          const num = termMatch[1];
          const unit = termMatch[2].toLowerCase();
          if (unit === 'yr' || unit === 'year') {
            termRaw = `${num} year${num === '1' ? '' : 's'}`;
          } else {
            termRaw = `${num} month${num === '1' ? '' : 's'}`;
          }
        }

        results.push({
          bank_name: bankName,
          apy_raw: apyRaw,
          min_deposit_raw: minDepositRaw,
          term_raw: termRaw,
        });

        if (results.length >= maxRates) break;
      }

      // Fallback strategy: if card-based extraction found nothing,
      // scan for any elements with bank-like names and APY percentages.
      // This guards against future Bankrate layout changes.
      if (results.length === 0) {
        const fallbackCards = document.querySelectorAll(
          '[class*="product"], [class*="offer"], [class*="rate-card"], [class*="ComparisonRow"]',
        );
        for (const card of fallbackCards) {
          const text = card.textContent || '';
          // Skip fragments that are too short or too long to be a single offer.
          if (text.length < 20 || text.length > 2000) continue;

          const apyMatch = text.match(/(\d+\.?\d*)\s*%\s*(?:APY)?/);
          if (!apyMatch) continue;

          const nameEl = card.querySelector('img[alt], h2, h3, h4, h5, [class*="name"], [class*="bank"]');
          const bankName = (nameEl as HTMLImageElement)?.alt
            || nameEl?.textContent?.trim()
            || '';
          if (!bankName || bankName.length < 2 || /^\d/.test(bankName) || bankName.includes('%')) continue;

          const depositMatch = text.match(/\$[\d,]+/);
          const termMatch = text.match(/(\d+)\s*(?:month|year)s?/i);

          results.push({
            bank_name: bankName,
            apy_raw: apyMatch[1] + '%',
            min_deposit_raw: depositMatch?.[0] || '',
            term_raw: termMatch?.[0] || '',
          });

          if (results.length >= maxRates) break;
        }
      }

      return results;
    }, MAX_RATES);

    console.log(`Raw extraction found ${rawRates.length} rate entries.`);

    // Parse, validate, deduplicate and sort
    return normalizeRawRates(rawRates, rateType);
  } finally {
    await page.close();
  }
}
|
|
|
|
/**
 * Store scraped rates into shared.cd_rates.
 *
 * Historical data is preserved — previous rows are never deleted. All rows of
 * one call are inserted in a single transaction and share one fetched_at
 * timestamp, so a batch is either stored completely or not at all.
 *
 * The connection string comes from DATABASE_URL; when that is unset or empty,
 * a local default is used.
 *
 * @param rates     - Rows to insert.
 * @param sourceUrl - URL the rates were scraped from; stored with every row.
 * @throws Rethrows any connection or query error after rolling back. The pool
 *         is always shut down, including when connecting fails.
 */
async function storeRates(rates: MarketRate[], sourceUrl: string): Promise<void> {
  const connectionString =
    process.env.DATABASE_URL ||
    'postgresql://hoafinance:change_me@localhost:5432/hoafinance';

  const pool = new Pool({ connectionString });

  try {
    // Acquired inside the outer try so pool.end() still runs if this rejects.
    const client = await pool.connect();

    try {
      await client.query('BEGIN');

      const now = new Date().toISOString();

      for (const rate of rates) {
        await client.query(
          `INSERT INTO shared.cd_rates
            (bank_name, apy, min_deposit, term, term_months, rate_type, fetched_at, source_url)
           VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
          [
            rate.bank_name,
            rate.apy,
            rate.min_deposit,
            rate.term,
            rate.term_months,
            rate.rate_type,
            now,
            sourceUrl,
          ],
        );
      }

      await client.query('COMMIT');
      console.log(` Stored ${rates.length} ${rates[0]?.rate_type || ''} rates at ${now}`);
    } catch (err) {
      // A failing ROLLBACK (e.g. on a broken connection) must not replace the
      // error that caused it; log it and rethrow the original.
      try {
        await client.query('ROLLBACK');
      } catch (rollbackErr) {
        console.error(' ROLLBACK failed:', rollbackErr);
      }
      throw err;
    } finally {
      client.release();
    }
  } finally {
    await pool.end();
  }
}
|
|
|
|
/**
 * Main entry point.
 *
 * Launches one headless browser, then for each entry of RATE_SOURCES scrapes
 * the page and stores the extracted rates. A failure for one rate type is
 * logged and does not stop the others. Between fetches the script pauses for
 * 8-12 seconds to avoid rate limiting.
 *
 * Failure is reported through process.exitCode (1 when nothing was stored or a
 * fatal error occurred) rather than process.exit(), so the `finally` block
 * always gets to close the browser before the process ends.
 */
async function main(): Promise<void> {
  console.log('=== Market Rate Fetcher ===');
  console.log(`Fetching rates from Bankrate.com...`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log(`Rate types: ${RATE_SOURCES.map((s) => s.label).join(', ')}`);

  let browser: Browser | null = null;

  try {
    console.log('\nLaunching headless browser...');
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
      ],
    });

    let totalStored = 0;

    for (const [i, source] of RATE_SOURCES.entries()) {
      // Pause between fetches to avoid rate limiting (skip for first)
      if (i > 0) {
        const pauseSeconds = 8 + Math.floor(Math.random() * 5); // 8-12 seconds
        console.log(`\nPausing ${pauseSeconds} seconds before next fetch...`);
        await sleep(pauseSeconds * 1000);
      }

      try {
        const rates = await fetchRatesFromPage(browser, source.url, source.type, source.label);

        if (rates.length === 0) {
          console.warn(`\nWARNING: No ${source.label} rates were extracted.`);
          console.warn('This may mean Bankrate changed their page structure.');
          continue; // Don't abort the whole run — try other rate types
        }

        console.log(`\nExtracted ${rates.length} ${source.label}:`);
        console.log('\u2500'.repeat(80));
        for (const r of rates) {
          // Term column stays blank (but aligned) for products without a term.
          const termStr = r.term !== 'N/A' ? r.term.padEnd(15) : ''.padEnd(15);
          console.log(
            ` ${r.bank_name.padEnd(35)} ${String(r.apy + '%').padEnd(8)} ${termStr} ${r.min_deposit != null ? '$' + r.min_deposit.toLocaleString() : 'N/A'}`,
          );
        }
        console.log('\u2500'.repeat(80));

        console.log(`\nStoring ${source.label} to database...`);
        await storeRates(rates, source.url);
        totalStored += rates.length;
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        console.error(`\nERROR fetching ${source.label}: ${message}`);
        // Continue to next rate type
      }
    }

    if (totalStored === 0) {
      console.warn('\nWARNING: No rates were stored for any type.');
      console.warn('Review Bankrate page structure and update selectors.');
      process.exitCode = 1;
    } else {
      console.log(`\nDone. Total rates stored: ${totalStored}`);
    }
  } catch (err) {
    console.error('\nFATAL ERROR:', err);
    process.exitCode = 1;
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
|
|
|
|
// main() reports its own failures; this handler only covers a rejection that
// escapes it (e.g. an error thrown while closing the browser), so it is logged
// instead of surfacing as an unhandled promise rejection.
main().catch((err: unknown) => {
  console.error('\nFATAL ERROR:', err);
  process.exitCode = 1;
});
|