fix: Improved duplicate prevention in cast iron scanner

- Better link normalization and checking
- Skip items already in seen_links with logging
- Clean up state file to last 500 items
- Always mark items as seen (deal or not)
- Added logging for skipped duplicates

Also: eBay scraping is temporarily blocked or the page format has changed; still investigating.
This commit is contained in:
2026-04-10 16:16:35 -04:00
parent 30703bfd45
commit 4bd829ca8c
9 changed files with 573 additions and 1002 deletions

View File

@@ -44,6 +44,10 @@
"deep-scraper": {
"version": "1.0.1",
"installedAt": 1775821381724
},
"marketplace": {
"version": "1.0.1",
"installedAt": 1775850557338
}
}
}

View File

@@ -27,7 +27,13 @@ def load_config():
def load_state():
"""Load previously seen items to avoid duplicates"""
if STATE_FILE.exists():
return json.loads(STATE_FILE.read_text())
data = json.loads(STATE_FILE.read_text())
# Ensure we have the right structure
if isinstance(data, dict) and 'seen_links' in data:
# Keep only last 500 items to prevent bloat
if len(data['seen_links']) > 500:
data['seen_links'] = data['seen_links'][-500:]
return data
return {"seen_links": [], "last_scan": None}
def save_state(state):
@@ -112,8 +118,14 @@ def scan_all_sources():
min_discount = config.get('min_discount_percent', 80) # Only ultra-deals now
for item in all_items:
# Normalize link for comparison
link = item.get('link', '')
if not link:
continue
# Skip if already seen
if item['link'] in seen_links:
if link in seen_links:
log(f"⏭️ Skipping duplicate: {item['title'][:50]}")
continue
# Check if it's a good deal
@@ -124,8 +136,8 @@ def scan_all_sources():
send_telegram_alert(item, fmv, discount)
deals_found += 1
# Mark as seen
seen_links.add(item['link'])
# ALWAYS mark as seen (whether deal or not) to prevent future duplicates
seen_links.add(link)
# Keep only last 1000 seen items to prevent state file from growing forever
if len(seen_links) > 1000:

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
{
"last_check": "2026-04-10T10:09:06.314350",
"last_check": "2026-04-10T13:10:34.858296",
"processed": 0,
"upgraded": 0,
"processed_ids": [

View File

@@ -114,8 +114,15 @@
"1sgqzb9",
"1sgosjw",
"1shf8nt",
"1sh34bf"
"1sh34bf",
"1shvh2y",
"1shtzie",
"1shsg8u",
"1shr186",
"1shqvf5",
"1shn1jg",
"1shm8fy"
],
"total_scanned": 1400,
"total_matches": 35
"total_scanned": 1450,
"total_matches": 39
}

View File

@@ -2886,3 +2886,19 @@ No new leads found
[Fri Apr 10 11:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 11:18:09 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 11:18:09 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 12:00:02 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 12:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 12:19:24 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 12:19:24 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 13:00:02 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 13:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 14:00:02 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 14:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 14:15:36 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 14:15:36 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 15:00:02 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 15:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 15:24:12 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 15:24:12 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 16:00:01 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 16:00:01 EDT 2026] Response size: 7791 bytes

View File

@@ -2325,3 +2325,35 @@
[2026-04-10T15:18:09Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T15:18:09Z] Processing calc submissions...
[2026-04-10T15:18:09Z] Check complete. Next run at 2026-04-10T12:18:EDT
[2026-04-10T16:00:00Z] Starting lead monitor check
[2026-04-10T16:00:02Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T16:00:02Z] Processing calc submissions...
[2026-04-10T16:00:02Z] Check complete. Next run at 2026-04-10T13:00:EDT
[2026-04-10T16:19:22Z] Starting lead monitor check
[2026-04-10T16:19:24Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T16:19:24Z] Processing calc submissions...
[2026-04-10T16:19:24Z] Check complete. Next run at 2026-04-10T13:19:EDT
[2026-04-10T17:00:00Z] Starting lead monitor check
[2026-04-10T17:00:02Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T17:00:02Z] Processing calc submissions...
[2026-04-10T17:00:02Z] Check complete. Next run at 2026-04-10T14:00:EDT
[2026-04-10T18:00:00Z] Starting lead monitor check
[2026-04-10T18:00:02Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T18:00:02Z] Processing calc submissions...
[2026-04-10T18:00:02Z] Check complete. Next run at 2026-04-10T15:00:EDT
[2026-04-10T18:15:34Z] Starting lead monitor check
[2026-04-10T18:15:36Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T18:15:36Z] Processing calc submissions...
[2026-04-10T18:15:36Z] Check complete. Next run at 2026-04-10T15:15:EDT
[2026-04-10T19:00:01Z] Starting lead monitor check
[2026-04-10T19:00:02Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T19:00:02Z] Processing calc submissions...
[2026-04-10T19:00:02Z] Check complete. Next run at 2026-04-10T16:00:EDT
[2026-04-10T19:24:10Z] Starting lead monitor check
[2026-04-10T19:24:12Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T19:24:12Z] Processing calc submissions...
[2026-04-10T19:24:12Z] Check complete. Next run at 2026-04-10T16:24:EDT
[2026-04-10T20:00:00Z] Starting lead monitor check
[2026-04-10T20:00:01Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T20:00:01Z] Processing calc submissions...
[2026-04-10T20:00:01Z] Check complete. Next run at 2026-04-10T17:00:EDT

View File

@@ -1,7 +1,7 @@
{
"processed_leads": [],
"processed_calc_ids": [1, 2, 3, 4],
"last_check": "2026-04-10T15:18:09Z",
"last_check": "2026-04-10T20:00:01Z",
"status": "active",
"notes": "Hourly monitoring enabled. Next check in 60 minutes."
}

View File

@@ -1,9 +1,9 @@
# Self-Improving Heartbeat State
last_heartbeat_started_at: 2026-04-10T15:47:00Z
last_heartbeat_started_at: 2026-04-10T18:41:00Z
last_reviewed_change_at: 2026-03-26T12:20:00Z
last_heartbeat_result: HEARTBEAT_OK
## Last actions
- 2026-04-10 15:47Z: Heartbeat check - no changes in self-improving files since last review
- 2026-04-10 18:41Z: Heartbeat check - no changes in self-improving files since last review
- Sales-lead agent: ✅ Cron executed at 04:17 AM, 3 leads detected (john@example.com, jane@example123.com, smith@example.com)
- Marketing-content agent: ✅ Today's 9:00 AM run completed successfully. No new content produced since last check.