fix: Improve duplicate prevention in cast iron scanner

- Better link normalization and checking
- Skip items already in seen_links with logging
- Trim seen_links to the last 500 items when loading the state file (the scan loop still caps the set at 1000)
- Always mark items as seen (deal or not)
- Added logging for skipped duplicates

Also: eBay scraping appears to be temporarily blocked, or the site has changed; still investigating.
This commit is contained in:
2026-04-10 16:16:35 -04:00
parent 30703bfd45
commit 4bd829ca8c
9 changed files with 573 additions and 1002 deletions

View File

@@ -44,6 +44,10 @@
"deep-scraper": { "deep-scraper": {
"version": "1.0.1", "version": "1.0.1",
"installedAt": 1775821381724 "installedAt": 1775821381724
},
"marketplace": {
"version": "1.0.1",
"installedAt": 1775850557338
} }
} }
} }

View File

@@ -27,7 +27,13 @@ def load_config():
def load_state(): def load_state():
"""Load previously seen items to avoid duplicates""" """Load previously seen items to avoid duplicates"""
if STATE_FILE.exists(): if STATE_FILE.exists():
return json.loads(STATE_FILE.read_text()) data = json.loads(STATE_FILE.read_text())
# Ensure we have the right structure
if isinstance(data, dict) and 'seen_links' in data:
# Keep only last 500 items to prevent bloat
if len(data['seen_links']) > 500:
data['seen_links'] = data['seen_links'][-500:]
return data
return {"seen_links": [], "last_scan": None} return {"seen_links": [], "last_scan": None}
def save_state(state): def save_state(state):
@@ -112,8 +118,14 @@ def scan_all_sources():
min_discount = config.get('min_discount_percent', 80) # Only ultra-deals now min_discount = config.get('min_discount_percent', 80) # Only ultra-deals now
for item in all_items: for item in all_items:
# Normalize link for comparison
link = item.get('link', '')
if not link:
continue
# Skip if already seen # Skip if already seen
if item['link'] in seen_links: if link in seen_links:
log(f"⏭️ Skipping duplicate: {item['title'][:50]}")
continue continue
# Check if it's a good deal # Check if it's a good deal
@@ -124,8 +136,8 @@ def scan_all_sources():
send_telegram_alert(item, fmv, discount) send_telegram_alert(item, fmv, discount)
deals_found += 1 deals_found += 1
# Mark as seen # ALWAYS mark as seen (whether deal or not) to prevent future duplicates
seen_links.add(item['link']) seen_links.add(link)
# Keep only last 1000 seen items to prevent state file from growing forever # Keep only last 1000 seen items to prevent state file from growing forever
if len(seen_links) > 1000: if len(seen_links) > 1000:

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
{ {
"last_check": "2026-04-10T10:09:06.314350", "last_check": "2026-04-10T13:10:34.858296",
"processed": 0, "processed": 0,
"upgraded": 0, "upgraded": 0,
"processed_ids": [ "processed_ids": [

View File

@@ -114,8 +114,15 @@
"1sgqzb9", "1sgqzb9",
"1sgosjw", "1sgosjw",
"1shf8nt", "1shf8nt",
"1sh34bf" "1sh34bf",
"1shvh2y",
"1shtzie",
"1shsg8u",
"1shr186",
"1shqvf5",
"1shn1jg",
"1shm8fy"
], ],
"total_scanned": 1400, "total_scanned": 1450,
"total_matches": 35 "total_matches": 39
} }

View File

@@ -2886,3 +2886,19 @@ No new leads found
[Fri Apr 10 11:00:02 EDT 2026] Response size: 7791 bytes [Fri Apr 10 11:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 11:18:09 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding [Fri Apr 10 11:18:09 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 11:18:09 EDT 2026] Response size: 7791 bytes [Fri Apr 10 11:18:09 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 12:00:02 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 12:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 12:19:24 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 12:19:24 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 13:00:02 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 13:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 14:00:02 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 14:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 14:15:36 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 14:15:36 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 15:00:02 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 15:00:02 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 15:24:12 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 15:24:12 EDT 2026] Response size: 7791 bytes
[Fri Apr 10 16:00:01 EDT 2026] ✓ hoaledgeriq.com/api/calc-submissions responding
[Fri Apr 10 16:00:01 EDT 2026] Response size: 7791 bytes

View File

@@ -2325,3 +2325,35 @@
[2026-04-10T15:18:09Z] ROI Calc submissions response: 7791 bytes [2026-04-10T15:18:09Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T15:18:09Z] Processing calc submissions... [2026-04-10T15:18:09Z] Processing calc submissions...
[2026-04-10T15:18:09Z] Check complete. Next run at 2026-04-10T12:18:EDT [2026-04-10T15:18:09Z] Check complete. Next run at 2026-04-10T12:18:EDT
[2026-04-10T16:00:00Z] Starting lead monitor check
[2026-04-10T16:00:02Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T16:00:02Z] Processing calc submissions...
[2026-04-10T16:00:02Z] Check complete. Next run at 2026-04-10T13:00:EDT
[2026-04-10T16:19:22Z] Starting lead monitor check
[2026-04-10T16:19:24Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T16:19:24Z] Processing calc submissions...
[2026-04-10T16:19:24Z] Check complete. Next run at 2026-04-10T13:19:EDT
[2026-04-10T17:00:00Z] Starting lead monitor check
[2026-04-10T17:00:02Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T17:00:02Z] Processing calc submissions...
[2026-04-10T17:00:02Z] Check complete. Next run at 2026-04-10T14:00:EDT
[2026-04-10T18:00:00Z] Starting lead monitor check
[2026-04-10T18:00:02Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T18:00:02Z] Processing calc submissions...
[2026-04-10T18:00:02Z] Check complete. Next run at 2026-04-10T15:00:EDT
[2026-04-10T18:15:34Z] Starting lead monitor check
[2026-04-10T18:15:36Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T18:15:36Z] Processing calc submissions...
[2026-04-10T18:15:36Z] Check complete. Next run at 2026-04-10T15:15:EDT
[2026-04-10T19:00:01Z] Starting lead monitor check
[2026-04-10T19:00:02Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T19:00:02Z] Processing calc submissions...
[2026-04-10T19:00:02Z] Check complete. Next run at 2026-04-10T16:00:EDT
[2026-04-10T19:24:10Z] Starting lead monitor check
[2026-04-10T19:24:12Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T19:24:12Z] Processing calc submissions...
[2026-04-10T19:24:12Z] Check complete. Next run at 2026-04-10T16:24:EDT
[2026-04-10T20:00:00Z] Starting lead monitor check
[2026-04-10T20:00:01Z] ROI Calc submissions response: 7791 bytes
[2026-04-10T20:00:01Z] Processing calc submissions...
[2026-04-10T20:00:01Z] Check complete. Next run at 2026-04-10T17:00:EDT

View File

@@ -1,7 +1,7 @@
{ {
"processed_leads": [], "processed_leads": [],
"processed_calc_ids": [1, 2, 3, 4], "processed_calc_ids": [1, 2, 3, 4],
"last_check": "2026-04-10T15:18:09Z", "last_check": "2026-04-10T20:00:01Z",
"status": "active", "status": "active",
"notes": "Hourly monitoring enabled. Next check in 60 minutes." "notes": "Hourly monitoring enabled. Next check in 60 minutes."
} }

View File

@@ -1,9 +1,9 @@
# Self-Improving Heartbeat State # Self-Improving Heartbeat State
last_heartbeat_started_at: 2026-04-10T15:47:00Z last_heartbeat_started_at: 2026-04-10T18:41:00Z
last_reviewed_change_at: 2026-03-26T12:20:00Z last_reviewed_change_at: 2026-03-26T12:20:00Z
last_heartbeat_result: HEARTBEAT_OK last_heartbeat_result: HEARTBEAT_OK
## Last actions ## Last actions
- 2026-04-10 15:47Z: Heartbeat check - no changes in self-improving files since last review - 2026-04-10 18:41Z: Heartbeat check - no changes in self-improving files since last review
- Sales-lead agent: ✅ Cron executed at 04:17 AM, 3 leads detected (john@example.com, jane@example123.com, smith@example.com) - Sales-lead agent: ✅ Cron executed at 04:17 AM, 3 leads detected (john@example.com, jane@example123.com, smith@example.com)
- Marketing-content agent: ✅ Today's 9:00 AM run completed successfully. No new content produced since last check. - Marketing-content agent: ✅ Today's 9:00 AM run completed successfully. No new content produced since last check.