import praw
import prawcore
import re
from collections import Counter
import sqlite3
import json
import datetime
import time
import logging
import os

# Tunable config variables
CLIENT_ID = 'YOUR_CLIENT_ID'          # Reddit app credentials -- never publish real values
CLIENT_SECRET = 'YOUR_CLIENT_SECRET'
USER_AGENT = 'wsb_ticker_scanner v1.0'  # Change if needed for uniqueness
SUBREDDIT = 'wallstreetbets'
DB_FILE = 'data/wsb_mentions.db'
WATCHLIST_FILE = 'watchlist.json'
MAX_RETRIES = 3      # Max API retry attempts
RETRY_BACKOFF = 5    # Base seconds to wait between retries (doubles each attempt)
THROTTLE_SLEEP = 1   # Seconds to sleep between post fetches to avoid rate limits

# Ensure the data/ directory exists before the log file and DB are opened
os.makedirs('data', exist_ok=True)

# Set up logging
logging.basicConfig(filename='data/wsb_scanner.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.info('Starting WSB ticker scan')

try:
    # Load watchlist (set of tickers for fast lookup)
    try:
        with open(WATCHLIST_FILE, 'r') as f:
            watchlist_data = json.load(f)
        watchlist = set(watchlist_data.get('tickers', []))
        logging.info(f'Loaded {len(watchlist)} tickers from watchlist')
    except FileNotFoundError:
        watchlist = set()
        logging.warning('Watchlist file not found; using empty watchlist')
    except json.JSONDecodeError:
        watchlist = set()
        logging.error('Invalid JSON in watchlist; using empty watchlist')

    # Authenticate with Reddit API
    reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET,
                         user_agent=USER_AGENT)
    subreddit = reddit.subreddit(SUBREDDIT)
    logging.info(f'Authenticated with Reddit API for r/{SUBREDDIT}')

    # Prepare for scanning: 24-hour cutoff (Unix timestamp)
    cutoff_time = time.time() - 86400  # 24 hours ago
    mention_counter = Counter()

    # Retry helper for API calls
    def fetch_with_retry(func, *args, **kwargs):
        for attempt in range(MAX_RETRIES):
            try:
                return func(*args, **kwargs)
            # PRAW surfaces HTTP and rate-limit errors via prawcore, so catch both
            except (praw.exceptions.PRAWException,
                    prawcore.exceptions.PrawcoreException) as e:
                if attempt == MAX_RETRIES - 1:
                    raise
                wait_time = RETRY_BACKOFF * (2 ** attempt)  # Exponential backoff
                logging.warning(f'API error: {e}. Retrying in {wait_time}s '
                                f'(attempt {attempt + 1}/{MAX_RETRIES})')
                time.sleep(wait_time)

    def read_post(submission):
        # Accessing attributes can trigger a lazy API fetch, so this is always
        # called through fetch_with_retry; combines title and body text
        return submission.created_utc, submission.title + ' ' + (submission.selftext or '')

    # Scan new posts until cutoff
    post_count = 0
    for submission in subreddit.new(limit=None):
        created_utc, text = fetch_with_retry(read_post, submission)
        if created_utc < cutoff_time:
            logging.info(f'Reached 24-hour cutoff after {post_count} posts')
            break
        # Find tickers: $ followed by 1-5 uppercase letters
        tickers = re.findall(r'\$([A-Z]{1,5})', text)
        mention_counter.update(tickers)
        post_count += 1
        time.sleep(THROTTLE_SLEEP)  # Throttle to be safe

    logging.info(f'Scanned {post_count} posts; found mentions for '
                 f'{len(mention_counter)} unique tickers')
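    # Illustration of what the ticker regex extracts (the sample string below
    # is made up for demonstration, not taken from real scan data):
    #   re.findall(r'\$([A-Z]{1,5})', 'YOLO $GME and $amc to the moon, also $TSLA!')
    #   -> ['GME', 'TSLA']   # $amc is skipped: the pattern requires uppercase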
    # Get today's date for the DB insert
    today = datetime.date.today().isoformat()

    # Connect to SQLite DB
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    # Create table if not exists (UNIQUE constraint makes reruns idempotent)
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS ticker_mentions (
            date TEXT,
            ticker TEXT,
            count INTEGER,
            watched INTEGER,
            UNIQUE(date, ticker)
        )
    ''')
    # Insert or replace counts for today
    for ticker, count in mention_counter.items():
        watched = 1 if ticker in watchlist else 0
        cursor.execute('''
            INSERT OR REPLACE INTO ticker_mentions (date, ticker, count, watched)
            VALUES (?, ?, ?, ?)
        ''', (today, ticker, count, watched))
    conn.commit()
    conn.close()
    logging.info('Data inserted into DB successfully')

except Exception as e:
    logging.error(f'Script failed: {e}')
    raise  # Re-raise for cron to capture if needed

logging.info('Scan complete')
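
# --- Usage notes (illustrative sketches; file contents, paths, and schedule
# --- below are assumptions, not defined by the script itself) ---
#
# Expected watchlist.json shape -- only the "tickers" key is read:
#   {"tickers": ["GME", "AMC", "TSLA"]}
#
# Example cron entry running the scan daily at 06:00 (script path is
# hypothetical; cd first, since DB_FILE and WATCHLIST_FILE are relative paths):
#   0 6 * * * cd /opt/wsb && /usr/bin/python3 wsb_scanner.py
#
# Example query to inspect a day's top mentions from the resulting DB:
#   sqlite3 data/wsb_mentions.db \
#     "SELECT ticker, count, watched FROM ticker_mentions
#      WHERE date = date('now') ORDER BY count DESC LIMIT 10;"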