| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| import praw | |
| import time | |
| from datetime import datetime, timedelta | |
| import json | |
| import os | |
| from typing import List, Dict, Any, Optional, Tuple | |
| import concurrent.futures | |
| from functools import lru_cache | |
| import hashlib | |
| import pytz | |
| import sqlite3 | |
| import networkx as nx | |
| from pathlib import Path | |
# Advanced features are optional; the app degrades gracefully if the module is unavailable
| try: | |
| from advanced_reddit_scraper import ( | |
| AdvancedRedditScraper, | |
| ExponentialBackoff, | |
| CommentHierarchyTracker, | |
| CheckpointManager | |
| ) | |
| ADVANCED_FEATURES = True | |
| except ImportError: | |
| ADVANCED_FEATURES = False | |
| def load_env_file(env_path: str = ".env") -> Dict[str, str]: | |
| """ | |
| Load environment variables from .env file | |
| Args: | |
| env_path: Path to .env file | |
| Returns: | |
| Dictionary of environment variables | |
| """ | |
| env_vars = {} | |
| env_file = Path(env_path) | |
| if env_file.exists(): | |
| with open(env_file, 'r') as f: | |
| for line in f: | |
| line = line.strip() | |
| if line and not line.startswith('#') and '=' in line: | |
| key, value = line.split('=', 1) | |
| key = key.strip() | |
| value = value.strip().strip('"').strip("'") | |
| env_vars[key] = value | |
| return env_vars | |
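# For reference: a minimal .env file that load_env_file() can parse might look like the
# snippet below (keys match what the sidebar reads further down; values are placeholders):
#
#     REDDIT_CLIENT_ID=your_client_id_here
#     REDDIT_CLIENT_SECRET=your_client_secret_here
#     REDDIT_USER_AGENT="Platform:AppName:1.0 (by /u/YourUsername)"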
| st.set_page_config( | |
| page_title="Reddit Research Dashboard", | |
page_icon="📊",
| layout="wide", | |
| initial_sidebar_state="expanded" | |
| ) | |
| st.markdown(""" | |
| <style> | |
| /* Full width container */ | |
| .main .block-container { | |
| max-width: 100%; | |
| padding-left: 2rem; | |
| padding-right: 2rem; | |
| } | |
| .main-header { | |
| font-size: 2.5rem; | |
| font-weight: bold; | |
| margin-bottom: 1rem; | |
| background: linear-gradient(90deg, #FF4500 0%, #FFA500 100%); | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| } | |
| .metric-card { | |
| background-color: #f0f2f6; | |
| padding: 1rem; | |
| border-radius: 0.5rem; | |
| border-left: 3px solid #FF4500; | |
| } | |
| .stProgress > div > div > div > div { | |
| background-color: #FF4500; | |
| } | |
| .info-box { | |
| background-color: #e8f4f8; | |
| padding: 1rem; | |
| border-radius: 0.5rem; | |
| margin: 1rem 0; | |
| border-left: 3px solid #1f77b4; | |
| } | |
| .stream-output { | |
| background-color: #f8f9fa; | |
| padding: 1rem; | |
| border-radius: 0.5rem; | |
| font-family: 'SF Mono', Monaco, monospace; | |
| font-size: 0.85rem; | |
| max-height: 600px; | |
| overflow-y: auto; | |
| margin: 1rem 0; | |
| border: 1px solid #dee2e6; | |
| } | |
| .stream-item { | |
| padding: 0.75rem; | |
| margin: 0.5rem 0; | |
| background: white; | |
| border-radius: 0.25rem; | |
| border-left: 3px solid #FF4500; | |
| box-shadow: 0 1px 3px rgba(0,0,0,0.1); | |
| } | |
| .stream-title { | |
| font-weight: bold; | |
| color: #1a1a1b; | |
| margin-bottom: 0.25rem; | |
| } | |
| .stream-meta { | |
| color: #787c7e; | |
| font-size: 0.8rem; | |
| } | |
| .stream-stats { | |
| background-color: #e8f4f8; | |
| padding: 0.5rem; | |
| border-radius: 0.25rem; | |
| margin: 0.5rem 0; | |
| font-size: 0.9rem; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| class OptimizedRedditScraper: | |
| """ | |
| Optimized Reddit scraper with batch processing, caching, and temporal analytics | |
| """ | |
| def __init__(self, client_id: str, client_secret: str, user_agent: str): | |
| """Initialize with Reddit API credentials""" | |
| self.reddit = praw.Reddit( | |
| client_id=client_id, | |
| client_secret=client_secret, | |
| user_agent=user_agent, | |
| check_for_async=False | |
| ) | |
| self.last_request_time = 0 | |
| self.min_delay = 0.5 | |
| def fetch_subreddit_data_verbose(self, subreddit_name: str, sort_by: str = "hot", | |
| limit: int = 200, time_filter: str = "month", | |
| log_container=None) -> pd.DataFrame: | |
| """ | |
| Fetch Reddit data with verbose logging | |
| Args: | |
| subreddit_name: Name of subreddit to scrape | |
| sort_by: Sort method (hot, new, top, rising) | |
| limit: Number of posts to fetch (optimized for 200+ items) | |
| time_filter: Time filter for top posts | |
| log_container: Streamlit container for logging output | |
| Returns: | |
| DataFrame with Reddit posts data | |
| """ | |
| def stream_post(post_data, stream_container): | |
| """Display a post as it's collected""" | |
| if stream_container: | |
| timestamp = datetime.now().strftime("%H:%M:%S") | |
| with stream_container.container(): | |
| with st.expander(f"π {post_data['title'][:80]}...", expanded=False): | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric("Score", post_data['score']) | |
| with col2: | |
| st.metric("Comments", post_data['num_comments']) | |
| with col3: | |
| st.text(f"u/{post_data['author']}") | |
| with col4: | |
| st.text(timestamp) | |
| def update_stats(stats_container, total, authors, comments): | |
| """Update collection statistics""" | |
| if stats_container: | |
| stats_container.empty() | |
| with stats_container: | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("π Posts", total) | |
| with col2: | |
| st.metric("π₯ Authors", authors) | |
| with col3: | |
| st.metric("π¬ Comments", f"{comments:,}") | |
| # Initialize streaming containers | |
| stats_container = None | |
| stream_container = None | |
| if log_container: | |
| # Check if log_container is a tuple of (stats, stream) | |
| if isinstance(log_container, tuple): | |
| stats_container, stream_container = log_container | |
| else: | |
| stats_container = log_container | |
| stream_container = log_container | |
| data = [] | |
| try: | |
| subreddit = self.reddit.subreddit(subreddit_name) | |
| # Choose appropriate method based on sort_by | |
| if sort_by == "top": | |
| submissions = subreddit.top(limit=limit, time_filter=time_filter) | |
| elif sort_by == "new": | |
| submissions = subreddit.new(limit=limit) | |
| elif sort_by == "rising": | |
| submissions = subreddit.rising(limit=limit) | |
| else: | |
| submissions = subreddit.hot(limit=limit) | |
| # Batch processing with rate limiting | |
| batch_size = 25 | |
| batch = [] | |
| batch_num = 1 | |
| post_count = 0 | |
| total_comments = 0 | |
| try: | |
| # Convert to list to handle iterator exhaustion gracefully | |
| submissions_list = [] | |
| try: | |
| for submission in submissions: | |
| try: | |
| # Force PRAW to load the submission by accessing an attribute | |
| _ = submission.id | |
| submissions_list.append(submission) | |
| if len(submissions_list) >= limit: | |
| break | |
| except Exception as sub_error: | |
| # Skip submissions that fail to load | |
| continue | |
| except StopIteration: | |
| pass # Iterator exhausted naturally | |
| except Exception as fetch_error: | |
| error_msg = str(fetch_error) | |
| if "Ran out of input" in error_msg or "prawcore" in error_msg.lower(): | |
| # PRAW iterator exhausted - not an error, just end of data | |
| pass | |
| else: | |
| if log_container: | |
| st.warning(f"β οΈ Stopped early: {error_msg}") | |
if not submissions_list:
    if log_container:
        st.error("No data could be fetched from this subreddit.")
    # Note: `error_msg` may be undefined here (no exception was raised), and a bare
    # `raise` has no active exception to re-raise, so raise an explicit error instead
    raise RuntimeError(f"No submissions could be fetched from r/{subreddit_name}")
| for i, submission in enumerate(submissions_list): | |
| try: | |
| # Rate limiting before fetching submission data | |
| current_time = time.time() | |
| if current_time - self.last_request_time < self.min_delay: | |
| time.sleep(self.min_delay - (current_time - self.last_request_time)) | |
| self.last_request_time = time.time() | |
| batch.append(submission) | |
| post_count += 1 | |
| if len(batch) >= batch_size or post_count >= limit: | |
| # Process batch | |
| for idx, sub in enumerate(batch): | |
| try: | |
| # Safely extract all attributes with error handling | |
| try: | |
| post_id = sub.id | |
| post_title = sub.title | |
| post_author = str(sub.author) if sub.author else '[deleted]' | |
| post_created = datetime.fromtimestamp(sub.created_utc, tz=pytz.UTC) | |
| post_score = sub.score | |
| post_comments = sub.num_comments | |
| post_ratio = sub.upvote_ratio | |
| post_text = sub.selftext[:500] if sub.selftext else '' | |
| post_url = sub.url | |
| post_flair = sub.link_flair_text or 'No Flair' | |
| post_video = sub.is_video | |
| post_self = sub.is_self | |
| post_permalink = f"https://reddit.com{sub.permalink}" | |
| except AttributeError as attr_error: | |
| # Missing attribute - skip this post | |
| continue | |
| except Exception as access_error: | |
| # Any other error accessing attributes - skip | |
| continue | |
| post_data = { | |
| 'id': post_id, | |
| 'title': post_title, | |
| 'author': post_author, | |
| 'created_utc': post_created, | |
| 'score': post_score, | |
| 'num_comments': post_comments, | |
| 'upvote_ratio': post_ratio, | |
| 'selftext': post_text, | |
| 'url': post_url, | |
| 'subreddit': subreddit_name, | |
| 'flair': post_flair, | |
| 'is_video': post_video, | |
| 'is_self': post_self, | |
| 'permalink': post_permalink | |
| } | |
| data.append(post_data) | |
| total_comments += post_data['num_comments'] | |
| # Stream the post to UI | |
| stream_post(post_data, stream_container) | |
| except Exception as post_error: | |
| # Skip posts that cause any error | |
| continue | |
| # Update stats | |
| if log_container: | |
| unique_authors = len(set(d['author'] for d in data)) | |
| update_stats(stats_container, len(data), unique_authors, total_comments) | |
| batch = [] | |
| batch_num += 1 | |
| # Update progress | |
| if st.session_state.get('progress_bar'): | |
| progress = min(post_count / limit, 1.0) | |
| st.session_state.progress_bar.progress(progress) | |
| # Stop if we've reached the limit | |
| if post_count >= limit: | |
| break | |
| except StopIteration: | |
| break | |
| except Exception as iter_error: | |
| continue | |
| # Process any remaining items in batch | |
| if batch: | |
| for idx, sub in enumerate(batch): | |
| try: | |
| # Safely extract all attributes | |
| try: | |
| post_id = sub.id | |
| post_title = sub.title | |
| post_author = str(sub.author) if sub.author else '[deleted]' | |
| post_created = datetime.fromtimestamp(sub.created_utc, tz=pytz.UTC) | |
| post_score = sub.score | |
| post_comments = sub.num_comments | |
| post_ratio = sub.upvote_ratio | |
| post_text = sub.selftext[:500] if sub.selftext else '' | |
| post_url = sub.url | |
| post_flair = sub.link_flair_text or 'No Flair' | |
| post_video = sub.is_video | |
| post_self = sub.is_self | |
| post_permalink = f"https://reddit.com{sub.permalink}" | |
| except Exception: | |
| # Skip posts that fail attribute access | |
| continue | |
| post_data = { | |
| 'id': post_id, | |
| 'title': post_title, | |
| 'author': post_author, | |
| 'created_utc': post_created, | |
| 'score': post_score, | |
| 'num_comments': post_comments, | |
| 'upvote_ratio': post_ratio, | |
| 'selftext': post_text, | |
| 'url': post_url, | |
| 'subreddit': subreddit_name, | |
| 'flair': post_flair, | |
| 'is_video': post_video, | |
| 'is_self': post_self, | |
| 'permalink': post_permalink | |
| } | |
| data.append(post_data) | |
| total_comments += post_data['num_comments'] | |
| stream_post(post_data, stream_container) | |
| except Exception: | |
| # Skip any problematic posts | |
| continue | |
| except StopIteration: | |
| pass | |
| # Final stats update | |
| if log_container: | |
| unique_authors = len(set(d['author'] for d in data)) | |
| update_stats(stats_container, len(data), unique_authors, total_comments) | |
| except Exception as e: | |
| error_msg = str(e) | |
| # Don't show scary errors for common PRAW issues | |
| if "Ran out of input" in error_msg or "prawcore" in error_msg.lower(): | |
| if log_container and len(data) == 0: | |
| st.warning("β οΈ No posts could be fetched. The subreddit may be empty or private.") | |
| else: | |
| if log_container: | |
| st.error(f"β Error: {error_msg}") | |
| if len(data) == 0: # Only raise if we got no data at all | |
| raise | |
| # Return whatever data we managed to collect | |
| if len(data) == 0 and log_container: | |
| st.info("βΉοΈ No posts were collected. Try adjusting your filters or selecting a different subreddit.") | |
| return pd.DataFrame(data) | |
| def fetch_subreddit_data(self, subreddit_name: str, sort_by: str = "hot", | |
| limit: int = 200, time_filter: str = "month") -> pd.DataFrame: | |
| """ | |
| Fetch data with manual session-based caching | |
| """ | |
| # Create cache key | |
| cache_key = f"{subreddit_name}_{sort_by}_{limit}_{time_filter}" | |
| # Check if data exists in session state cache | |
| if 'data_cache' not in st.session_state: | |
| st.session_state.data_cache = {} | |
| if cache_key in st.session_state.data_cache: | |
| cache_entry = st.session_state.data_cache[cache_key] | |
| # Check if cache is still valid (1 hour TTL) | |
| if (datetime.now() - cache_entry['timestamp']).total_seconds() < 3600: | |
| return cache_entry['data'] | |
| # Fetch new data | |
| df = self.fetch_subreddit_data_verbose(subreddit_name, sort_by, limit, time_filter, None) | |
| # Store in cache | |
| st.session_state.data_cache[cache_key] = { | |
| 'data': df, | |
| 'timestamp': datetime.now() | |
| } | |
| return df | |
| def fetch_multiple_subreddits(self, subreddits: List[str], limit_per: int = 100, | |
| sort_by: str = "hot") -> pd.DataFrame: | |
| """ | |
| Fetch data from multiple subreddits with manual caching | |
| Args: | |
| subreddits: List of subreddit names | |
| limit_per: Posts per subreddit | |
| sort_by: Sort method | |
| Returns: | |
| Combined DataFrame | |
| """ | |
| # Create cache key | |
| cache_key = f"multi_{'_'.join(sorted(subreddits))}_{sort_by}_{limit_per}" | |
| # Check cache | |
| if 'data_cache' not in st.session_state: | |
| st.session_state.data_cache = {} | |
| if cache_key in st.session_state.data_cache: | |
| cache_entry = st.session_state.data_cache[cache_key] | |
| # Check if cache is still valid (30 min TTL) | |
| if (datetime.now() - cache_entry['timestamp']).total_seconds() < 1800: | |
| return cache_entry['data'] | |
| # Fetch new data | |
| all_data = [] | |
| with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: | |
| future_to_sub = { | |
| executor.submit(self.fetch_subreddit_data, sub, sort_by, limit_per): sub | |
| for sub in subreddits | |
| } | |
| for future in concurrent.futures.as_completed(future_to_sub): | |
| sub = future_to_sub[future] | |
| try: | |
| data = future.result() | |
| all_data.append(data) | |
| except Exception as e: | |
| st.error(f"Error fetching r/{sub}: {e}") | |
| if all_data: | |
| df = pd.concat(all_data, ignore_index=True) | |
| else: | |
| df = pd.DataFrame() | |
| # Store in cache | |
| st.session_state.data_cache[cache_key] = { | |
| 'data': df, | |
| 'timestamp': datetime.now() | |
| } | |
| return df | |
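# A minimal usage sketch of OptimizedRedditScraper (hypothetical credential values) as it
# would be driven from a Streamlit script; main() below wires these same calls to the
# sidebar controls:
#
#     scraper = OptimizedRedditScraper("client_id", "client_secret", "RedditResearch/1.0")
#     df = scraper.fetch_subreddit_data("CUNY", sort_by="hot", limit=200)
#     st.dataframe(df[["title", "score", "num_comments"]])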
| def create_temporal_visualizations(df: pd.DataFrame) -> Dict[str, go.Figure]: | |
| """ | |
| Create comprehensive temporal analytics visualizations | |
| Args: | |
| df: DataFrame with Reddit data | |
| Returns: | |
| Dictionary of Plotly figures | |
| """ | |
| figures = {} | |
| # Ensure datetime column | |
| if 'created_utc' in df.columns: | |
| df['created_utc'] = pd.to_datetime(df['created_utc']) | |
| df = df.sort_values('created_utc') | |
| # Get actual date range of collected data with padding | |
| date_min = df['created_utc'].min() | |
| date_max = df['created_utc'].max() | |
| date_range = (date_max - date_min).days | |
| # Add 2% padding to prevent edge clipping | |
| padding = pd.Timedelta(days=max(1, int(date_range * 0.02))) | |
| date_min_padded = date_min - padding | |
| date_max_padded = date_max + padding | |
| # 1. Hourly activity heatmap | |
| df['hour'] = df['created_utc'].dt.hour | |
| df['day_of_week'] = df['created_utc'].dt.day_name() | |
| heatmap_data = df.groupby(['day_of_week', 'hour']).size().reset_index(name='count') | |
| pivot_data = heatmap_data.pivot(index='day_of_week', columns='hour', values='count').fillna(0) | |
| # Reorder days | |
| days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] | |
| pivot_data = pivot_data.reindex(days_order) | |
| fig_heatmap = go.Figure(data=go.Heatmap( | |
| z=pivot_data.values, | |
| x=pivot_data.columns, | |
| y=pivot_data.index, | |
| colorscale='RdYlBu_r', | |
| text=pivot_data.values.astype(int), | |
| texttemplate='%{text}', | |
| textfont={"size": 8}, | |
| hovertemplate='%{y}<br>%{x}:00<br>Posts: %{z}<extra></extra>' | |
| )) | |
| fig_heatmap.update_layout( | |
| title='Activity Heatmap by Hour and Day', | |
| xaxis_title='Hour of Day', | |
| yaxis_title='Day of Week', | |
| height=400 | |
| ) | |
| figures['heatmap'] = fig_heatmap | |
| # 2. Time series with rolling average - only include days with actual data | |
| daily_stats = df.set_index('created_utc').resample('D').agg({ | |
| 'id': 'count', | |
| 'score': 'mean', | |
| 'num_comments': 'mean' | |
| }).rename(columns={'id': 'post_count'}) | |
| # Filter out days with no posts to prevent misleading gaps | |
| daily_stats = daily_stats[daily_stats['post_count'] > 0] | |
| # Calculate rolling averages | |
| daily_stats['post_count_ma7'] = daily_stats['post_count'].rolling(window=7, min_periods=1).mean() | |
| daily_stats['score_ma7'] = daily_stats['score'].rolling(window=7, min_periods=1).mean() | |
| fig_timeline = make_subplots( | |
| rows=2, cols=1, | |
| subplot_titles=('Daily Post Activity', 'Average Engagement Metrics'), | |
| vertical_spacing=0.1 | |
| ) | |
| # Post count | |
| fig_timeline.add_trace( | |
| go.Scatter(x=daily_stats.index, y=daily_stats['post_count'], | |
| mode='markers', name='Daily Posts', opacity=0.5, | |
| marker=dict(size=5, color='lightblue')), | |
| row=1, col=1 | |
| ) | |
| fig_timeline.add_trace( | |
| go.Scatter(x=daily_stats.index, y=daily_stats['post_count_ma7'], | |
| mode='lines', name='7-Day Average', | |
| line=dict(color='blue', width=2), | |
| connectgaps=False), | |
| row=1, col=1 | |
| ) | |
| # Engagement metrics | |
| fig_timeline.add_trace( | |
| go.Scatter(x=daily_stats.index, y=daily_stats['score_ma7'], | |
| mode='lines', name='Avg Score (7-day)', | |
| line=dict(color='orange'), | |
| connectgaps=False), | |
| row=2, col=1 | |
| ) | |
| fig_timeline.add_trace( | |
| go.Scatter(x=daily_stats.index, y=daily_stats['num_comments'].rolling(window=7, min_periods=1).mean(), | |
| mode='lines', name='Avg Comments (7-day)', | |
| line=dict(color='green'), | |
| connectgaps=False), | |
| row=2, col=1 | |
| ) | |
| fig_timeline.update_layout( | |
| height=600, | |
| showlegend=True, | |
| title=f'Activity Timeline ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})', | |
| xaxis=dict(type='date', autorange=True), | |
| xaxis2=dict(type='date', autorange=True) | |
| ) | |
| fig_timeline.update_xaxes(title_text="Date", row=2, col=1) | |
| fig_timeline.update_yaxes(title_text="Count", row=1, col=1) | |
| fig_timeline.update_yaxes(title_text="Value", row=2, col=1) | |
| figures['timeline'] = fig_timeline | |
| # 3. Monthly trend analysis (only show if data spans at least 30 days) | |
| if date_range >= 30: | |
| monthly_data = df.set_index('created_utc').resample('M').agg({ | |
| 'id': 'count', | |
| 'score': ['mean', 'sum'], | |
| 'num_comments': ['mean', 'sum'] | |
| }) | |
| # Filter out months with no posts | |
| monthly_data = monthly_data[monthly_data[('id', 'count')] > 0] | |
| fig_monthly = go.Figure() | |
| fig_monthly.add_trace(go.Bar( | |
| x=monthly_data.index, | |
| y=monthly_data[('id', 'count')], | |
| name='Monthly Posts', | |
| marker_color='lightblue' | |
| )) | |
| fig_monthly.add_trace(go.Scatter( | |
| x=monthly_data.index, | |
| y=monthly_data[('score', 'mean')], | |
| name='Avg Score', | |
| yaxis='y2', | |
| line=dict(color='red', width=2), | |
| connectgaps=False | |
| )) | |
| fig_monthly.update_layout( | |
| title=f'Monthly Posting Trends ({date_min.strftime("%Y-%m")} to {date_max.strftime("%Y-%m")})', | |
| xaxis_title='Month', | |
| xaxis=dict(type='date', autorange=True), | |
| yaxis=dict(title='Post Count', side='left'), | |
| yaxis2=dict(title='Average Score', side='right', overlaying='y'), | |
| height=400, | |
| hovermode='x unified' | |
| ) | |
| figures['monthly'] = fig_monthly | |
| else: | |
| # For shorter periods, show weekly trends instead | |
| weekly_data = df.set_index('created_utc').resample('W').agg({ | |
| 'id': 'count', | |
| 'score': ['mean', 'sum'], | |
| 'num_comments': ['mean', 'sum'] | |
| }) | |
| # Filter out weeks with no posts | |
| weekly_data = weekly_data[weekly_data[('id', 'count')] > 0] | |
| fig_weekly = go.Figure() | |
| fig_weekly.add_trace(go.Bar( | |
| x=weekly_data.index, | |
| y=weekly_data[('id', 'count')], | |
| name='Weekly Posts', | |
| marker_color='lightblue' | |
| )) | |
| fig_weekly.add_trace(go.Scatter( | |
| x=weekly_data.index, | |
| y=weekly_data[('score', 'mean')], | |
| name='Avg Score', | |
| yaxis='y2', | |
| line=dict(color='red', width=2), | |
| connectgaps=False | |
| )) | |
| fig_weekly.update_layout( | |
| title=f'Weekly Posting Trends ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})', | |
| xaxis_title='Week', | |
| xaxis=dict(type='date', autorange=True), | |
| yaxis=dict(title='Post Count', side='left'), | |
| yaxis2=dict(title='Average Score', side='right', overlaying='y'), | |
| height=400, | |
| hovermode='x unified' | |
| ) | |
| figures['monthly'] = fig_weekly # Use same key for consistency | |
| # 4. Posting patterns by flair | |
| if 'flair' in df.columns: | |
| flair_time = df.groupby([pd.Grouper(key='created_utc', freq='W'), 'flair']).size().reset_index(name='count') | |
| top_flairs = df['flair'].value_counts().head(10).index | |
| flair_time_filtered = flair_time[flair_time['flair'].isin(top_flairs)] | |
| fig_flair = px.line(flair_time_filtered, x='created_utc', y='count', | |
| color='flair', | |
| title=f'Weekly Posting Patterns by Flair ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})', | |
| labels={'count': 'Number of Posts', 'created_utc': 'Week'}) | |
| fig_flair.update_layout( | |
| height=400, | |
| xaxis=dict(type='date', autorange=True) | |
| ) | |
| figures['flair'] = fig_flair | |
| return figures | |
| def create_engagement_analytics(df: pd.DataFrame) -> Dict[str, go.Figure]: | |
| """ | |
| Create engagement and interaction analytics | |
| Args: | |
| df: DataFrame with Reddit data | |
| Returns: | |
| Dictionary of engagement figures | |
| """ | |
| figures = {} | |
| # 1. Score distribution | |
| fig_score_dist = go.Figure() | |
| fig_score_dist.add_trace(go.Histogram( | |
| x=df['score'], | |
| nbinsx=50, | |
| name='Score Distribution', | |
| marker_color='orange' | |
| )) | |
| fig_score_dist.update_layout( | |
| title='Post Score Distribution', | |
| xaxis_title='Score', | |
| yaxis_title='Frequency', | |
| height=350 | |
| ) | |
| figures['score_dist'] = fig_score_dist | |
| # 2. Engagement correlation | |
| fig_correlation = px.scatter( | |
| df, x='score', y='num_comments', | |
| size='upvote_ratio', color='is_self', | |
| title='Score vs Comments Correlation', | |
| labels={'score': 'Post Score', 'num_comments': 'Number of Comments', | |
| 'is_self': 'Post Type', 'upvote_ratio': 'Upvote Ratio'}, | |
| hover_data=['title'] | |
| ) | |
| fig_correlation.update_layout(height=400) | |
| figures['correlation'] = fig_correlation | |
| # 3. Top performing posts | |
| top_posts = df.nlargest(10, 'score')[['title', 'score', 'num_comments', 'author']] | |
| fig_top = go.Figure(data=[ | |
| go.Bar(name='Score', x=top_posts['title'].str[:30] + '...', y=top_posts['score']), | |
| go.Bar(name='Comments', x=top_posts['title'].str[:30] + '...', y=top_posts['num_comments']) | |
| ]) | |
| fig_top.update_layout( | |
| title='Top 10 Posts by Engagement', | |
| barmode='group', | |
| height=400, | |
| xaxis_tickangle=-45 | |
| ) | |
| figures['top_posts'] = fig_top | |
| return figures | |
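# Both figure builders return plain dictionaries of Plotly figures, so they can be rendered
# anywhere. A sketch of how the Analytics tab below consumes them:
#
#     temporal_figs = create_temporal_visualizations(df)
#     engagement_figs = create_engagement_analytics(df)
#     st.plotly_chart(temporal_figs["heatmap"], use_container_width=True)
#     st.plotly_chart(engagement_figs["score_dist"], use_container_width=True)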
| def main(): | |
| """Main application function""" | |
| # Initialize session state | |
| if 'scraper' not in st.session_state: | |
| st.session_state.scraper = None | |
| if 'advanced_scraper' not in st.session_state: | |
| st.session_state.advanced_scraper = None | |
| if 'data' not in st.session_state: | |
| st.session_state.data = pd.DataFrame() | |
| if 'last_fetch' not in st.session_state: | |
| st.session_state.last_fetch = None | |
| if 'comment_hierarchies' not in st.session_state: | |
| st.session_state.comment_hierarchies = {} | |
| # Header | |
st.markdown('<h1 class="main-header">📊 Reddit Research Dashboard</h1>', unsafe_allow_html=True)
| st.markdown("Optimized for high-volume data collection and temporal analytics") | |
| # Load .env file if it exists | |
| env_vars = load_env_file(".env") | |
| if not env_vars: | |
| # Try parent directory | |
| env_vars = load_env_file("../.env") | |
| # Sidebar configuration | |
| with st.sidebar: | |
| st.header("βοΈ Configuration") | |
| # Show .env detection status | |
| if env_vars: | |
| st.success("β .env file detected and loaded") | |
| # API Credentials - prioritize .env, then environment variables | |
| with st.expander("π Reddit API Credentials", expanded=not bool(env_vars)): | |
| default_client_id = env_vars.get("REDDIT_CLIENT_ID", os.environ.get("REDDIT_CLIENT_ID", "")) | |
| default_client_secret = env_vars.get("REDDIT_CLIENT_SECRET", os.environ.get("REDDIT_CLIENT_SECRET", "")) | |
| default_user_agent = env_vars.get("REDDIT_USER_AGENT", os.environ.get("REDDIT_USER_AGENT", "RedditResearch/1.0")) | |
| client_id = st.text_input( | |
| "Client ID", | |
| value=default_client_id, | |
| type="password", | |
| help="Your Reddit API client ID (auto-populated from .env if available)" | |
| ) | |
| client_secret = st.text_input( | |
| "Client Secret", | |
| value=default_client_secret, | |
| type="password", | |
| help="Your Reddit API client secret (auto-populated from .env if available)" | |
| ) | |
| user_agent = st.text_input( | |
| "User Agent", | |
| value=default_user_agent, | |
| help="User agent string for API requests (auto-populated from .env if available)" | |
| ) | |
| if st.button("Initialize Scraper", type="primary"): | |
| if client_id and client_secret: | |
| try: | |
| st.session_state.scraper = OptimizedRedditScraper( | |
| client_id, client_secret, user_agent | |
| ) | |
| if ADVANCED_FEATURES: | |
| st.session_state.advanced_scraper = AdvancedRedditScraper( | |
| client_id, client_secret, user_agent | |
| ) | |
| st.success("β Scrapers initialized successfully (with advanced features)!") | |
| else: | |
| st.success("β Scraper initialized successfully!") | |
| except Exception as e: | |
| st.error(f"β Failed to initialize: {e}") | |
| else: | |
| st.warning("β οΈ Please provide API credentials") | |
| # Data Collection Settings | |
| st.header("π₯ Data Collection") | |
| # Show advanced mode only if features are available | |
| if ADVANCED_FEATURES: | |
| collection_mode = st.radio( | |
| "Collection Mode", | |
| ["Single Subreddit", "Multiple Subreddits", "Advanced with Hierarchy"] | |
| ) | |
| else: | |
| collection_mode = st.radio( | |
| "Collection Mode", | |
| ["Single Subreddit", "Multiple Subreddits"] | |
| ) | |
| if collection_mode == "Single Subreddit": | |
| subreddit_name = st.text_input("Subreddit Name", value="CUNY") | |
| subreddits = [subreddit_name] | |
| elif collection_mode == "Multiple Subreddits": | |
| subreddit_input = st.text_area( | |
| "Subreddits (one per line)", | |
| value="CUNY\nBaruch\nHunterCollege", | |
| height=100 | |
| ) | |
| subreddits = [s.strip() for s in subreddit_input.split('\n') if s.strip()] | |
| else: | |
| # Advanced with Hierarchy (only if ADVANCED_FEATURES is True) | |
| subreddit_name = st.text_input("Subreddit Name", value="CUNY") | |
| subreddits = [subreddit_name] | |
| use_checkpoint = st.checkbox("Enable checkpoint/resume", value=True) | |
| if use_checkpoint: | |
| checkpoint_name = st.text_input("Checkpoint name", value=f"{subreddit_name}_checkpoint") | |
| # Advanced settings | |
| with st.expander("βοΈ Advanced Settings"): | |
| sort_by = st.selectbox( | |
| "Sort By", | |
| ["hot", "new", "top", "rising"], | |
| help="How to sort posts" | |
| ) | |
| limit = st.slider( | |
| "Posts per Subreddit", | |
| min_value=50, | |
| max_value=500, | |
| value=200, | |
| step=50, | |
| help="Number of posts to fetch (optimized for 200+)" | |
| ) | |
| if sort_by == "top": | |
| time_filter = st.selectbox( | |
| "Time Filter", | |
| ["hour", "day", "week", "month", "year", "all"], | |
| index=3 | |
| ) | |
| else: | |
| time_filter = "month" | |
| batch_size = st.number_input( | |
| "Batch Size", | |
| min_value=10, | |
| max_value=50, | |
| value=25, | |
| help="Posts processed per batch" | |
| ) | |
| cache_ttl = st.number_input( | |
| "Cache Duration (minutes)", | |
| min_value=5, | |
| max_value=120, | |
| value=60, | |
| help="How long to cache results" | |
| ) | |
| # Main content area with tabs | |
| if st.session_state.scraper: | |
| # Create main tabs | |
main_tab1, main_tab2 = st.tabs(["📥 Live Collection", "📊 Analytics & Metrics"])
| with main_tab1: | |
| st.header("Live Data Collection") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| if st.button("π Start Collection", type="primary", width="stretch"): | |
| # Initialize/clear stream posts | |
| st.session_state.stream_posts = [] | |
| # Create display containers | |
| status_text = st.empty() | |
| progress_bar = st.progress(0) | |
| st.session_state.progress_bar = progress_bar | |
| # Fixed position containers for stats and stream | |
| stats_placeholder = st.empty() | |
| stream_placeholder = st.empty() | |
| status_text.info("π Starting collection...") | |
| try: | |
| if collection_mode == "Advanced with Hierarchy": | |
| # Advanced scraping with comment hierarchies | |
| status_text.info(f"Advanced scraping r/{subreddits[0]}...") | |
| checkpoint = checkpoint_name if use_checkpoint else None | |
| results = st.session_state.advanced_scraper.scrape_with_hierarchy( | |
| subreddits[0], limit=limit, checkpoint_name=checkpoint | |
| ) | |
| st.session_state.comment_hierarchies = results['hierarchies'] | |
| # Convert to DataFrame | |
| df = pd.DataFrame(results['submissions']) | |
| if df.empty: | |
| df = pd.DataFrame() | |
| else: | |
| df['created_utc'] = pd.to_datetime(df['created_utc']) | |
| st.session_state.data = df | |
| status_text.success(f"β Scraped {len(results['submissions'])} posts with {len(results['comments'])} comments!") | |
| elif len(subreddits) == 1: | |
| # Standard single subreddit with streaming | |
| status_text.info(f"Collecting from r/{subreddits[0]}...") | |
| # Show header for stats | |
| with stats_placeholder: | |
| st.subheader("π Live Collection Progress") | |
| # Pass the placeholders to the scraper | |
| df = st.session_state.scraper.fetch_subreddit_data_verbose( | |
| subreddits[0], sort_by, limit, time_filter, | |
| (stats_placeholder, stream_placeholder) | |
| ) | |
| st.session_state.data = df if not df.empty else pd.DataFrame() | |
| if len(df) > 0: | |
| status_text.success(f"β Collected {len(df)} posts!") | |
| else: | |
| status_text.warning("β οΈ No posts collected") | |
| else: | |
| # Multiple subreddits with streaming | |
| status_text.info(f"Collecting from {len(subreddits)} subreddits...") | |
| with stats_placeholder: | |
| st.subheader("π Live Collection Progress") | |
| all_data = [] | |
| for idx, sub in enumerate(subreddits): | |
| status_text.info(f"Collecting {idx+1}/{len(subreddits)}: r/{sub}...") | |
| df = st.session_state.scraper.fetch_subreddit_data_verbose( | |
| sub, sort_by, limit, time_filter, | |
| (stats_placeholder, stream_placeholder) | |
| ) | |
| all_data.append(df) | |
| if all_data: | |
| df = pd.concat(all_data, ignore_index=True) | |
| else: | |
| df = pd.DataFrame() | |
| st.session_state.data = df | |
| status_text.success(f"β Collected {len(df)} total posts!") | |
| st.session_state.last_fetch = datetime.now() | |
| except Exception as e: | |
| error_msg = str(e) | |
| # Don't show PRAW iterator exhaustion errors | |
| if "Ran out of input" not in error_msg and "prawcore" not in error_msg.lower(): | |
| status_text.error(f"β Error: {error_msg}") | |
| elif st.session_state.data.empty: | |
| status_text.warning("β οΈ No posts could be fetched. Try adjusting your filters.") | |
| with col2: | |
| if not st.session_state.data.empty: | |
| st.download_button( | |
| "π₯ Download CSV", | |
| st.session_state.data.to_csv(index=False), | |
| file_name=f"reddit_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", | |
| mime="text/csv", | |
| width="stretch" | |
| ) | |
| with col3: | |
| if st.session_state.last_fetch: | |
| st.info(f"Last: {st.session_state.last_fetch.strftime('%H:%M:%S')}") | |
| # Analytics & Metrics Tab | |
| with main_tab2: | |
| if not st.session_state.data.empty: | |
| df = st.session_state.data | |
| # Summary metrics at top | |
| st.header("π Summary Metrics") | |
| col1, col2, col3, col4, col5 = st.columns(5) | |
| with col1: | |
| st.metric("Total Posts", f"{len(df):,}") | |
| with col2: | |
| st.metric("Unique Authors", f"{df['author'].nunique():,}") | |
| with col3: | |
| st.metric("Avg Score", f"{df['score'].mean():.1f}") | |
| with col4: | |
| st.metric("Avg Comments", f"{df['num_comments'].mean():.1f}") | |
| with col5: | |
| st.metric("Subreddits", len(df['subreddit'].unique())) | |
| # Tabbed interface for different analyses | |
| tab1, tab2, tab3, tab4, tab5 = st.tabs([ | |
| "π Temporal Analytics", | |
| "π¬ Engagement Analysis", | |
| "π Raw Data", | |
| "π Search & Filter", | |
| "π³ Comment Hierarchies" | |
| ]) | |
| with tab1: | |
| st.header("Temporal Analytics") | |
| # Generate temporal visualizations | |
| temporal_figs = create_temporal_visualizations(df) | |
| # Activity heatmap | |
| st.plotly_chart(temporal_figs.get('heatmap'), use_container_width=True) | |
| # Time series | |
| st.plotly_chart(temporal_figs.get('timeline'), use_container_width=True) | |
| # Monthly trends | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.plotly_chart(temporal_figs.get('monthly'), use_container_width=True) | |
| with col2: | |
| if 'flair' in temporal_figs: | |
| st.plotly_chart(temporal_figs.get('flair'), use_container_width=True) | |
| with tab2: | |
| st.header("Engagement Analysis") | |
| engagement_figs = create_engagement_analytics(df) | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.plotly_chart(engagement_figs['score_dist'], use_container_width=True) | |
| with col2: | |
| st.plotly_chart(engagement_figs['correlation'], use_container_width=True) | |
| st.plotly_chart(engagement_figs['top_posts'], use_container_width=True) | |
| with tab3: | |
| st.header("Raw Data View") | |
| # Data filtering options | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| min_score = st.number_input("Min Score", value=0) | |
| with col2: | |
| min_comments = st.number_input("Min Comments", value=0) | |
| with col3: | |
| author_filter = st.text_input("Author Filter") | |
| # Apply filters | |
| filtered_df = df[ | |
| (df['score'] >= min_score) & | |
| (df['num_comments'] >= min_comments) | |
| ] | |
| if author_filter: | |
| filtered_df = filtered_df[ | |
| filtered_df['author'].str.contains(author_filter, case=False, na=False) | |
| ] | |
| st.dataframe( | |
| filtered_df[['title', 'author', 'score', 'num_comments', | |
| 'created_utc', 'subreddit', 'flair']], | |
| width="stretch", | |
| height=500 | |
| ) | |
| st.info(f"Showing {len(filtered_df)} of {len(df)} posts") | |
| with tab4: | |
| st.header("Search & Filter") | |
| search_query = st.text_input("Search in titles and text", placeholder="Enter keywords...") | |
| if search_query: | |
| mask = ( | |
| df['title'].str.contains(search_query, case=False, na=False) | | |
| df['selftext'].str.contains(search_query, case=False, na=False) | |
| ) | |
| search_results = df[mask] | |
| st.info(f"Found {len(search_results)} posts matching '{search_query}'") | |
| if not search_results.empty: | |
| for idx, row in search_results.head(10).iterrows(): | |
| with st.expander(f"π {row['title'][:100]}..."): | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Score", row['score']) | |
| with col2: | |
| st.metric("Comments", row['num_comments']) | |
| with col3: | |
| st.metric("Date", row['created_utc'].strftime('%Y-%m-%d')) | |
| st.write(f"**Author:** u/{row['author']}") | |
| st.write(f"**Subreddit:** r/{row['subreddit']}") | |
| if row['selftext']: | |
| st.write(f"**Text:** {row['selftext'][:500]}...") | |
| st.write(f"[View on Reddit]({row['permalink']})") | |
| with tab5: | |
| st.header("Comment Hierarchies") | |
| if not ADVANCED_FEATURES: | |
| st.info("β οΈ Comment hierarchy analysis requires additional dependencies. This feature is optional and not needed for basic data collection.") | |
| elif st.session_state.comment_hierarchies: | |
| # Select submission to view | |
| submission_ids = list(st.session_state.comment_hierarchies.keys()) | |
| selected_sub = st.selectbox("Select Submission", submission_ids) | |
| if selected_sub: | |
| hierarchy = st.session_state.comment_hierarchies[selected_sub] | |
| # Display submission info | |
| if hierarchy['submission']: | |
| st.subheader(f"π {hierarchy['submission'].get('title', 'No Title')}") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Score", hierarchy['submission'].get('score', 0)) | |
| with col2: | |
| st.metric("Comments", len(hierarchy.get('comments', {}))) | |
| with col3: | |
| st.metric("Author", hierarchy['submission'].get('author', '[deleted]')) | |
# Visualize comment tree (replies are rendered after each expander closes,
# because Streamlit does not allow expanders nested inside other expanders)
def display_comment_tree(comments, level=0):
    for comment_id, comment in comments.items():
        indent = "    " * level
        with st.expander(f"{indent}💬 {comment.get('author', '[deleted]')} - Score: {comment.get('score', 0)}"):
            st.write(comment.get('body', '')[:500])
        if 'replies' in comment and comment['replies']:
            st.write("**Replies:**")
            display_comment_tree(comment['replies'], level + 1)
| st.subheader("Comment Thread Structure") | |
| if hierarchy.get('hierarchy'): | |
| display_comment_tree(hierarchy['hierarchy']) | |
| else: | |
| st.info("No comments found for this submission") | |
| # Orphan statistics | |
| if st.session_state.get('advanced_scraper'): | |
| orphan_stats = st.session_state.advanced_scraper.hierarchy_tracker.get_orphan_statistics() | |
| if orphan_stats['orphaned_count'] > 0: | |
| st.warning(f"β οΈ {orphan_stats['orphaned_count']} orphaned comments detected ({orphan_stats['orphan_rate']:.1%} orphan rate)") | |
| else: | |
| st.info("Use 'Advanced with Hierarchy' collection mode to analyze comment structures") | |
| else: | |
| # Empty state - no data collected yet | |
| st.info("π Configure your settings in the sidebar and click 'Start Collection' to begin") | |
| # Quick start guide | |
| with st.expander("π Quick Start Guide"): | |
| st.markdown(""" | |
| ### Getting Started | |
| 1. **Set up API Credentials** | |
| - Get your Reddit API credentials from [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps) | |
| - Enter them in the sidebar | |
| - Click "Initialize Scraper" | |
| 2. **Choose Collection Mode** | |
| - **Single Subreddit**: Analyze one community in depth | |
| - **Multiple Subreddits**: Collect from multiple communities | |
| 3. **Configure Settings** | |
| - Adjust the number of posts (200+ recommended) | |
| - Choose sort method (hot, new, top, rising) | |
| - Set time filter for top posts | |
| 4. **Fetch & Analyze** | |
| - Click "Fetch Data" to start collection | |
| - Explore temporal patterns, engagement metrics | |
| - Export results as CSV for further analysis | |
| ### Features | |
| - **Batch Processing**: Efficiently handles 200+ posts | |
| - **Caching**: Reduces API calls with smart caching | |
| - **Temporal Analytics**: Hour/day/month patterns | |
| - **Engagement Metrics**: Score, comments, correlations | |
| """) | |
| else: | |
| st.warning("β οΈ Please initialize the scraper with your Reddit API credentials in the sidebar") | |
| # API setup instructions | |
| with st.expander("π How to get Reddit API credentials"): | |
| st.markdown(""" | |
| ### Setting up Reddit API Access | |
| 1. **Create a Reddit Account** (if you don't have one) | |
| - Go to [reddit.com](https://www.reddit.com) and sign up | |
| 2. **Create an App** | |
| - Visit [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps) | |
| - Click "Create App" or "Create Another App" | |
| - Fill in the form: | |
| - **Name**: Your app name (e.g., "Research Dashboard") | |
| - **App Type**: Select "script" | |
| - **Description**: Optional | |
| - **About URL**: Leave blank | |
| - **Redirect URI**: http://localhost:8000 | |
| - Click "Create app" | |
| 3. **Get Your Credentials** | |
| - **Client ID**: The string under "personal use script" | |
| - **Client Secret**: The secret key shown | |
| - **User Agent**: Format: "Platform:AppName:Version (by /u/YourUsername)" | |
| 4. **Enter in Sidebar** | |
| - Copy your credentials to the sidebar fields | |
| - Click "Initialize Scraper" | |
| """) | |
| if __name__ == "__main__": | |
| main() |