# reddit-dashboard/src/streamlit_app.py
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import praw
import time
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict, Any, Optional, Tuple
import concurrent.futures
from functools import lru_cache
import hashlib
import pytz
import sqlite3
import networkx as nx
from pathlib import Path
# Advanced features optional - will gracefully degrade if not available
try:
from advanced_reddit_scraper import (
AdvancedRedditScraper,
ExponentialBackoff,
CommentHierarchyTracker,
CheckpointManager
)
ADVANCED_FEATURES = True
except ImportError:
ADVANCED_FEATURES = False
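# Without the optional advanced_reddit_scraper module, the "Advanced with
# Hierarchy" collection mode is hidden and the Comment Hierarchies tab shows an
# informational message instead (see main() below).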
def load_env_file(env_path: str = ".env") -> Dict[str, str]:
"""
Load environment variables from .env file
Args:
env_path: Path to .env file
Returns:
Dictionary of environment variables
"""
env_vars = {}
env_file = Path(env_path)
if env_file.exists():
with open(env_file, 'r') as f:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
key, value = line.split('=', 1)
key = key.strip()
value = value.strip().strip('"').strip("'")
env_vars[key] = value
return env_vars
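# Example .env contents this loader expects (placeholder values, not real credentials):
#   REDDIT_CLIENT_ID=your_client_id
#   REDDIT_CLIENT_SECRET=your_client_secret
#   REDDIT_USER_AGENT=RedditResearch/1.0 (by /u/YourUsername)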
st.set_page_config(
page_title="Reddit Research Dashboard",
page_icon="📊",
layout="wide",
initial_sidebar_state="expanded"
)
st.markdown("""
<style>
/* Full width container */
.main .block-container {
max-width: 100%;
padding-left: 2rem;
padding-right: 2rem;
}
.main-header {
font-size: 2.5rem;
font-weight: bold;
margin-bottom: 1rem;
background: linear-gradient(90deg, #FF4500 0%, #FFA500 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.metric-card {
background-color: #f0f2f6;
padding: 1rem;
border-radius: 0.5rem;
border-left: 3px solid #FF4500;
}
.stProgress > div > div > div > div {
background-color: #FF4500;
}
.info-box {
background-color: #e8f4f8;
padding: 1rem;
border-radius: 0.5rem;
margin: 1rem 0;
border-left: 3px solid #1f77b4;
}
.stream-output {
background-color: #f8f9fa;
padding: 1rem;
border-radius: 0.5rem;
font-family: 'SF Mono', Monaco, monospace;
font-size: 0.85rem;
max-height: 600px;
overflow-y: auto;
margin: 1rem 0;
border: 1px solid #dee2e6;
}
.stream-item {
padding: 0.75rem;
margin: 0.5rem 0;
background: white;
border-radius: 0.25rem;
border-left: 3px solid #FF4500;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
.stream-title {
font-weight: bold;
color: #1a1a1b;
margin-bottom: 0.25rem;
}
.stream-meta {
color: #787c7e;
font-size: 0.8rem;
}
.stream-stats {
background-color: #e8f4f8;
padding: 0.5rem;
border-radius: 0.25rem;
margin: 0.5rem 0;
font-size: 0.9rem;
}
</style>
""", unsafe_allow_html=True)
class OptimizedRedditScraper:
"""
Optimized Reddit scraper with batch processing, caching, and temporal analytics
"""
def __init__(self, client_id: str, client_secret: str, user_agent: str):
"""Initialize with Reddit API credentials"""
self.reddit = praw.Reddit(
client_id=client_id,
client_secret=client_secret,
user_agent=user_agent,
check_for_async=False
)
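# check_for_async=False disables PRAW's asynchronous-environment check, which
# can otherwise log a warning when a running event loop is detected (as can
# happen under Streamlit). The attributes below add a simple client-side
# throttle: at most one submission request every 0.5 s (min_delay), on top of
# whatever rate limiting PRAW applies itself.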
self.last_request_time = 0
self.min_delay = 0.5
def fetch_subreddit_data_verbose(self, subreddit_name: str, sort_by: str = "hot",
limit: int = 200, time_filter: str = "month",
log_container=None) -> pd.DataFrame:
"""
Fetch Reddit data with verbose logging
Args:
subreddit_name: Name of subreddit to scrape
sort_by: Sort method (hot, new, top, rising)
limit: Number of posts to fetch (optimized for 200+ items)
time_filter: Time filter for top posts
log_container: Streamlit container for logging output
Returns:
DataFrame with Reddit posts data
"""
def stream_post(post_data, stream_container):
"""Display a post as it's collected"""
if stream_container:
timestamp = datetime.now().strftime("%H:%M:%S")
with stream_container.container():
with st.expander(f"📝 {post_data['title'][:80]}...", expanded=False):
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Score", post_data['score'])
with col2:
st.metric("Comments", post_data['num_comments'])
with col3:
st.text(f"u/{post_data['author']}")
with col4:
st.text(timestamp)
def update_stats(stats_container, total, authors, comments):
"""Update collection statistics"""
if stats_container:
stats_container.empty()
with stats_container:
col1, col2, col3 = st.columns(3)
with col1:
st.metric("📊 Posts", total)
with col2:
st.metric("👥 Authors", authors)
with col3:
st.metric("💬 Comments", f"{comments:,}")
# Initialize streaming containers
stats_container = None
stream_container = None
if log_container:
# Check if log_container is a tuple of (stats, stream)
if isinstance(log_container, tuple):
stats_container, stream_container = log_container
else:
stats_container = log_container
stream_container = log_container
data = []
try:
subreddit = self.reddit.subreddit(subreddit_name)
# Choose appropriate method based on sort_by
if sort_by == "top":
submissions = subreddit.top(limit=limit, time_filter=time_filter)
elif sort_by == "new":
submissions = subreddit.new(limit=limit)
elif sort_by == "rising":
submissions = subreddit.rising(limit=limit)
else:
submissions = subreddit.hot(limit=limit)
# Batch processing with rate limiting
batch_size = 25
batch = []
batch_num = 1
post_count = 0
total_comments = 0
try:
# Convert to list to handle iterator exhaustion gracefully
submissions_list = []
try:
for submission in submissions:
try:
# Force PRAW to load the submission by accessing an attribute
_ = submission.id
submissions_list.append(submission)
if len(submissions_list) >= limit:
break
except Exception as sub_error:
# Skip submissions that fail to load
continue
except StopIteration:
pass # Iterator exhausted naturally
except Exception as fetch_error:
error_msg = str(fetch_error)
if "Ran out of input" in error_msg or "prawcore" in error_msg.lower():
# PRAW iterator exhausted - not an error, just end of data
pass
else:
if log_container:
st.warning(f"⚠️ Stopped early: {error_msg}")
if not submissions_list:
if log_container:
st.error(f"No data could be fetched from r/{subreddit_name}")
raise RuntimeError(f"No submissions could be fetched from r/{subreddit_name}")
for i, submission in enumerate(submissions_list):
try:
# Rate limiting before fetching submission data
current_time = time.time()
if current_time - self.last_request_time < self.min_delay:
time.sleep(self.min_delay - (current_time - self.last_request_time))
self.last_request_time = time.time()
batch.append(submission)
post_count += 1
if len(batch) >= batch_size or post_count >= limit:
# Process batch
for idx, sub in enumerate(batch):
try:
# Safely extract all attributes with error handling
try:
post_id = sub.id
post_title = sub.title
post_author = str(sub.author) if sub.author else '[deleted]'
post_created = datetime.fromtimestamp(sub.created_utc, tz=pytz.UTC)
post_score = sub.score
post_comments = sub.num_comments
post_ratio = sub.upvote_ratio
post_text = sub.selftext[:500] if sub.selftext else ''
post_url = sub.url
post_flair = sub.link_flair_text or 'No Flair'
post_video = sub.is_video
post_self = sub.is_self
post_permalink = f"https://reddit.com{sub.permalink}"
except AttributeError as attr_error:
# Missing attribute - skip this post
continue
except Exception as access_error:
# Any other error accessing attributes - skip
continue
post_data = {
'id': post_id,
'title': post_title,
'author': post_author,
'created_utc': post_created,
'score': post_score,
'num_comments': post_comments,
'upvote_ratio': post_ratio,
'selftext': post_text,
'url': post_url,
'subreddit': subreddit_name,
'flair': post_flair,
'is_video': post_video,
'is_self': post_self,
'permalink': post_permalink
}
data.append(post_data)
total_comments += post_data['num_comments']
# Stream the post to UI
stream_post(post_data, stream_container)
except Exception as post_error:
# Skip posts that cause any error
continue
# Update stats
if log_container:
unique_authors = len(set(d['author'] for d in data))
update_stats(stats_container, len(data), unique_authors, total_comments)
batch = []
batch_num += 1
# Update progress
if st.session_state.get('progress_bar'):
progress = min(post_count / limit, 1.0)
st.session_state.progress_bar.progress(progress)
# Stop if we've reached the limit
if post_count >= limit:
break
except StopIteration:
break
except Exception as iter_error:
continue
# Process any remaining items in batch
if batch:
for idx, sub in enumerate(batch):
try:
# Safely extract all attributes
try:
post_id = sub.id
post_title = sub.title
post_author = str(sub.author) if sub.author else '[deleted]'
post_created = datetime.fromtimestamp(sub.created_utc, tz=pytz.UTC)
post_score = sub.score
post_comments = sub.num_comments
post_ratio = sub.upvote_ratio
post_text = sub.selftext[:500] if sub.selftext else ''
post_url = sub.url
post_flair = sub.link_flair_text or 'No Flair'
post_video = sub.is_video
post_self = sub.is_self
post_permalink = f"https://reddit.com{sub.permalink}"
except Exception:
# Skip posts that fail attribute access
continue
post_data = {
'id': post_id,
'title': post_title,
'author': post_author,
'created_utc': post_created,
'score': post_score,
'num_comments': post_comments,
'upvote_ratio': post_ratio,
'selftext': post_text,
'url': post_url,
'subreddit': subreddit_name,
'flair': post_flair,
'is_video': post_video,
'is_self': post_self,
'permalink': post_permalink
}
data.append(post_data)
total_comments += post_data['num_comments']
stream_post(post_data, stream_container)
except Exception:
# Skip any problematic posts
continue
except StopIteration:
pass
# Final stats update
if log_container:
unique_authors = len(set(d['author'] for d in data))
update_stats(stats_container, len(data), unique_authors, total_comments)
except Exception as e:
error_msg = str(e)
# Don't show scary errors for common PRAW issues
if "Ran out of input" in error_msg or "prawcore" in error_msg.lower():
if log_container and len(data) == 0:
st.warning("⚠️ No posts could be fetched. The subreddit may be empty or private.")
else:
if log_container:
st.error(f"❌ Error: {error_msg}")
if len(data) == 0: # Only raise if we got no data at all
raise
# Return whatever data we managed to collect
if len(data) == 0 and log_container:
st.info("ℹ️ No posts were collected. Try adjusting your filters or selecting a different subreddit.")
return pd.DataFrame(data)
def fetch_subreddit_data(self, subreddit_name: str, sort_by: str = "hot",
limit: int = 200, time_filter: str = "month") -> pd.DataFrame:
"""
Fetch data with manual session-based caching
"""
# Create cache key
cache_key = f"{subreddit_name}_{sort_by}_{limit}_{time_filter}"
# Check if data exists in session state cache
if 'data_cache' not in st.session_state:
st.session_state.data_cache = {}
if cache_key in st.session_state.data_cache:
cache_entry = st.session_state.data_cache[cache_key]
# Check if cache is still valid (1 hour TTL)
if (datetime.now() - cache_entry['timestamp']).total_seconds() < 3600:
return cache_entry['data']
# Fetch new data
df = self.fetch_subreddit_data_verbose(subreddit_name, sort_by, limit, time_filter, None)
# Store in cache
st.session_state.data_cache[cache_key] = {
'data': df,
'timestamp': datetime.now()
}
return df
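# Illustrative usage (credentials are placeholders):
#   scraper = OptimizedRedditScraper("client_id", "client_secret", "RedditResearch/1.0")
#   df = scraper.fetch_subreddit_data("CUNY", sort_by="top", limit=200, time_filter="month")
# Repeating the same call within the 1-hour TTL returns the cached DataFrame
# instead of hitting the Reddit API again.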
def fetch_multiple_subreddits(self, subreddits: List[str], limit_per: int = 100,
sort_by: str = "hot") -> pd.DataFrame:
"""
Fetch data from multiple subreddits with manual caching
Args:
subreddits: List of subreddit names
limit_per: Posts per subreddit
sort_by: Sort method
Returns:
Combined DataFrame
"""
# Create cache key
cache_key = f"multi_{'_'.join(sorted(subreddits))}_{sort_by}_{limit_per}"
# Check cache
if 'data_cache' not in st.session_state:
st.session_state.data_cache = {}
if cache_key in st.session_state.data_cache:
cache_entry = st.session_state.data_cache[cache_key]
# Check if cache is still valid (30 min TTL)
if (datetime.now() - cache_entry['timestamp']).total_seconds() < 1800:
return cache_entry['data']
# Fetch new data
all_data = []
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
future_to_sub = {
executor.submit(self.fetch_subreddit_data, sub, sort_by, limit_per): sub
for sub in subreddits
}
for future in concurrent.futures.as_completed(future_to_sub):
sub = future_to_sub[future]
try:
data = future.result()
all_data.append(data)
except Exception as e:
st.error(f"Error fetching r/{sub}: {e}")
if all_data:
df = pd.concat(all_data, ignore_index=True)
else:
df = pd.DataFrame()
# Store in cache
st.session_state.data_cache[cache_key] = {
'data': df,
'timestamp': datetime.now()
}
return df
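# Illustrative usage with the sidebar's default subreddit list:
#   df = scraper.fetch_multiple_subreddits(["CUNY", "Baruch", "HunterCollege"], limit_per=100)
# Each subreddit is fetched on its own worker thread (max_workers=3) and the
# results are concatenated into one DataFrame, cached for 30 minutes.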
def create_temporal_visualizations(df: pd.DataFrame) -> Dict[str, go.Figure]:
"""
Create comprehensive temporal analytics visualizations
Args:
df: DataFrame with Reddit data
Returns:
Dictionary of Plotly figures
"""
figures = {}
# Ensure datetime column
if 'created_utc' in df.columns:
df['created_utc'] = pd.to_datetime(df['created_utc'])
df = df.sort_values('created_utc')
# Get actual date range of collected data with padding
date_min = df['created_utc'].min()
date_max = df['created_utc'].max()
date_range = (date_max - date_min).days
# Add 2% padding to prevent edge clipping
padding = pd.Timedelta(days=max(1, int(date_range * 0.02)))
date_min_padded = date_min - padding
date_max_padded = date_max + padding
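# e.g. a 100-day span gets max(1, int(100 * 0.02)) = 2 days of padding per side;
# spans shorter than 50 days fall back to the 1-day minimum.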
# 1. Hourly activity heatmap
df['hour'] = df['created_utc'].dt.hour
df['day_of_week'] = df['created_utc'].dt.day_name()
heatmap_data = df.groupby(['day_of_week', 'hour']).size().reset_index(name='count')
pivot_data = heatmap_data.pivot(index='day_of_week', columns='hour', values='count').fillna(0)
# Reorder days
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot_data = pivot_data.reindex(days_order)
fig_heatmap = go.Figure(data=go.Heatmap(
z=pivot_data.values,
x=pivot_data.columns,
y=pivot_data.index,
colorscale='RdYlBu_r',
text=pivot_data.values.astype(int),
texttemplate='%{text}',
textfont={"size": 8},
hovertemplate='%{y}<br>%{x}:00<br>Posts: %{z}<extra></extra>'
))
fig_heatmap.update_layout(
title='Activity Heatmap by Hour and Day',
xaxis_title='Hour of Day',
yaxis_title='Day of Week',
height=400
)
figures['heatmap'] = fig_heatmap
# 2. Time series with rolling average - only include days with actual data
daily_stats = df.set_index('created_utc').resample('D').agg({
'id': 'count',
'score': 'mean',
'num_comments': 'mean'
}).rename(columns={'id': 'post_count'})
# Filter out days with no posts to prevent misleading gaps
daily_stats = daily_stats[daily_stats['post_count'] > 0]
# Calculate rolling averages
daily_stats['post_count_ma7'] = daily_stats['post_count'].rolling(window=7, min_periods=1).mean()
daily_stats['score_ma7'] = daily_stats['score'].rolling(window=7, min_periods=1).mean()
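# min_periods=1 keeps the earliest points defined (averaging however many days
# exist so far) rather than returning NaN until a full 7-day window accumulates.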
fig_timeline = make_subplots(
rows=2, cols=1,
subplot_titles=('Daily Post Activity', 'Average Engagement Metrics'),
vertical_spacing=0.1
)
# Post count
fig_timeline.add_trace(
go.Scatter(x=daily_stats.index, y=daily_stats['post_count'],
mode='markers', name='Daily Posts', opacity=0.5,
marker=dict(size=5, color='lightblue')),
row=1, col=1
)
fig_timeline.add_trace(
go.Scatter(x=daily_stats.index, y=daily_stats['post_count_ma7'],
mode='lines', name='7-Day Average',
line=dict(color='blue', width=2),
connectgaps=False),
row=1, col=1
)
# Engagement metrics
fig_timeline.add_trace(
go.Scatter(x=daily_stats.index, y=daily_stats['score_ma7'],
mode='lines', name='Avg Score (7-day)',
line=dict(color='orange'),
connectgaps=False),
row=2, col=1
)
fig_timeline.add_trace(
go.Scatter(x=daily_stats.index, y=daily_stats['num_comments'].rolling(window=7, min_periods=1).mean(),
mode='lines', name='Avg Comments (7-day)',
line=dict(color='green'),
connectgaps=False),
row=2, col=1
)
fig_timeline.update_layout(
height=600,
showlegend=True,
title=f'Activity Timeline ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
xaxis=dict(type='date', autorange=True),
xaxis2=dict(type='date', autorange=True)
)
fig_timeline.update_xaxes(title_text="Date", row=2, col=1)
fig_timeline.update_yaxes(title_text="Count", row=1, col=1)
fig_timeline.update_yaxes(title_text="Value", row=2, col=1)
figures['timeline'] = fig_timeline
# 3. Monthly trend analysis (only show if data spans at least 30 days)
if date_range >= 30:
monthly_data = df.set_index('created_utc').resample('M').agg({
'id': 'count',
'score': ['mean', 'sum'],
'num_comments': ['mean', 'sum']
})
# Filter out months with no posts
monthly_data = monthly_data[monthly_data[('id', 'count')] > 0]
fig_monthly = go.Figure()
fig_monthly.add_trace(go.Bar(
x=monthly_data.index,
y=monthly_data[('id', 'count')],
name='Monthly Posts',
marker_color='lightblue'
))
fig_monthly.add_trace(go.Scatter(
x=monthly_data.index,
y=monthly_data[('score', 'mean')],
name='Avg Score',
yaxis='y2',
line=dict(color='red', width=2),
connectgaps=False
))
fig_monthly.update_layout(
title=f'Monthly Posting Trends ({date_min.strftime("%Y-%m")} to {date_max.strftime("%Y-%m")})',
xaxis_title='Month',
xaxis=dict(type='date', autorange=True),
yaxis=dict(title='Post Count', side='left'),
yaxis2=dict(title='Average Score', side='right', overlaying='y'),
height=400,
hovermode='x unified'
)
figures['monthly'] = fig_monthly
else:
# For shorter periods, show weekly trends instead
weekly_data = df.set_index('created_utc').resample('W').agg({
'id': 'count',
'score': ['mean', 'sum'],
'num_comments': ['mean', 'sum']
})
# Filter out weeks with no posts
weekly_data = weekly_data[weekly_data[('id', 'count')] > 0]
fig_weekly = go.Figure()
fig_weekly.add_trace(go.Bar(
x=weekly_data.index,
y=weekly_data[('id', 'count')],
name='Weekly Posts',
marker_color='lightblue'
))
fig_weekly.add_trace(go.Scatter(
x=weekly_data.index,
y=weekly_data[('score', 'mean')],
name='Avg Score',
yaxis='y2',
line=dict(color='red', width=2),
connectgaps=False
))
fig_weekly.update_layout(
title=f'Weekly Posting Trends ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
xaxis_title='Week',
xaxis=dict(type='date', autorange=True),
yaxis=dict(title='Post Count', side='left'),
yaxis2=dict(title='Average Score', side='right', overlaying='y'),
height=400,
hovermode='x unified'
)
figures['monthly'] = fig_weekly # Use same key for consistency
# 4. Posting patterns by flair
if 'flair' in df.columns:
flair_time = df.groupby([pd.Grouper(key='created_utc', freq='W'), 'flair']).size().reset_index(name='count')
top_flairs = df['flair'].value_counts().head(10).index
flair_time_filtered = flair_time[flair_time['flair'].isin(top_flairs)]
fig_flair = px.line(flair_time_filtered, x='created_utc', y='count',
color='flair',
title=f'Weekly Posting Patterns by Flair ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
labels={'count': 'Number of Posts', 'created_utc': 'Week'})
fig_flair.update_layout(
height=400,
xaxis=dict(type='date', autorange=True)
)
figures['flair'] = fig_flair
return figures
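# Keys returned: 'heatmap', 'timeline', 'monthly' (weekly figures reuse this key
# when the data spans fewer than 30 days), and 'flair' when a flair column is
# present. main() renders them in the "Temporal Analytics" tab.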
def create_engagement_analytics(df: pd.DataFrame) -> Dict[str, go.Figure]:
"""
Create engagement and interaction analytics
Args:
df: DataFrame with Reddit data
Returns:
Dictionary of engagement figures
"""
figures = {}
# 1. Score distribution
fig_score_dist = go.Figure()
fig_score_dist.add_trace(go.Histogram(
x=df['score'],
nbinsx=50,
name='Score Distribution',
marker_color='orange'
))
fig_score_dist.update_layout(
title='Post Score Distribution',
xaxis_title='Score',
yaxis_title='Frequency',
height=350
)
figures['score_dist'] = fig_score_dist
# 2. Engagement correlation
fig_correlation = px.scatter(
df, x='score', y='num_comments',
size='upvote_ratio', color='is_self',
title='Score vs Comments Correlation',
labels={'score': 'Post Score', 'num_comments': 'Number of Comments',
'is_self': 'Post Type', 'upvote_ratio': 'Upvote Ratio'},
hover_data=['title']
)
fig_correlation.update_layout(height=400)
figures['correlation'] = fig_correlation
# 3. Top performing posts
top_posts = df.nlargest(10, 'score')[['title', 'score', 'num_comments', 'author']]
fig_top = go.Figure(data=[
go.Bar(name='Score', x=top_posts['title'].str[:30] + '...', y=top_posts['score']),
go.Bar(name='Comments', x=top_posts['title'].str[:30] + '...', y=top_posts['num_comments'])
])
fig_top.update_layout(
title='Top 10 Posts by Engagement',
barmode='group',
height=400,
xaxis_tickangle=-45
)
figures['top_posts'] = fig_top
return figures
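# Keys returned: 'score_dist', 'correlation', and 'top_posts', rendered in the
# "Engagement Analysis" tab of main().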
def main():
"""Main application function"""
# Initialize session state
if 'scraper' not in st.session_state:
st.session_state.scraper = None
if 'advanced_scraper' not in st.session_state:
st.session_state.advanced_scraper = None
if 'data' not in st.session_state:
st.session_state.data = pd.DataFrame()
if 'last_fetch' not in st.session_state:
st.session_state.last_fetch = None
if 'comment_hierarchies' not in st.session_state:
st.session_state.comment_hierarchies = {}
# Header
st.markdown('<h1 class="main-header">📊 Reddit Research Dashboard</h1>', unsafe_allow_html=True)
st.markdown("Optimized for high-volume data collection and temporal analytics")
# Load .env file if it exists
env_vars = load_env_file(".env")
if not env_vars:
# Try parent directory
env_vars = load_env_file("../.env")
# Sidebar configuration
with st.sidebar:
st.header("⚙️ Configuration")
# Show .env detection status
if env_vars:
st.success("✅ .env file detected and loaded")
# API Credentials - prioritize .env, then environment variables
with st.expander("🔑 Reddit API Credentials", expanded=not bool(env_vars)):
default_client_id = env_vars.get("REDDIT_CLIENT_ID", os.environ.get("REDDIT_CLIENT_ID", ""))
default_client_secret = env_vars.get("REDDIT_CLIENT_SECRET", os.environ.get("REDDIT_CLIENT_SECRET", ""))
default_user_agent = env_vars.get("REDDIT_USER_AGENT", os.environ.get("REDDIT_USER_AGENT", "RedditResearch/1.0"))
client_id = st.text_input(
"Client ID",
value=default_client_id,
type="password",
help="Your Reddit API client ID (auto-populated from .env if available)"
)
client_secret = st.text_input(
"Client Secret",
value=default_client_secret,
type="password",
help="Your Reddit API client secret (auto-populated from .env if available)"
)
user_agent = st.text_input(
"User Agent",
value=default_user_agent,
help="User agent string for API requests (auto-populated from .env if available)"
)
if st.button("Initialize Scraper", type="primary"):
if client_id and client_secret:
try:
st.session_state.scraper = OptimizedRedditScraper(
client_id, client_secret, user_agent
)
if ADVANCED_FEATURES:
st.session_state.advanced_scraper = AdvancedRedditScraper(
client_id, client_secret, user_agent
)
st.success("✅ Scrapers initialized successfully (with advanced features)!")
else:
st.success("✅ Scraper initialized successfully!")
except Exception as e:
st.error(f"❌ Failed to initialize: {e}")
else:
st.warning("⚠️ Please provide API credentials")
# Data Collection Settings
st.header("📥 Data Collection")
# Show advanced mode only if features are available
if ADVANCED_FEATURES:
collection_mode = st.radio(
"Collection Mode",
["Single Subreddit", "Multiple Subreddits", "Advanced with Hierarchy"]
)
else:
collection_mode = st.radio(
"Collection Mode",
["Single Subreddit", "Multiple Subreddits"]
)
if collection_mode == "Single Subreddit":
subreddit_name = st.text_input("Subreddit Name", value="CUNY")
subreddits = [subreddit_name]
elif collection_mode == "Multiple Subreddits":
subreddit_input = st.text_area(
"Subreddits (one per line)",
value="CUNY\nBaruch\nHunterCollege",
height=100
)
subreddits = [s.strip() for s in subreddit_input.split('\n') if s.strip()]
else:
# Advanced with Hierarchy (only if ADVANCED_FEATURES is True)
subreddit_name = st.text_input("Subreddit Name", value="CUNY")
subreddits = [subreddit_name]
use_checkpoint = st.checkbox("Enable checkpoint/resume", value=True)
if use_checkpoint:
checkpoint_name = st.text_input("Checkpoint name", value=f"{subreddit_name}_checkpoint")
# Advanced settings
with st.expander("⚙️ Advanced Settings"):
sort_by = st.selectbox(
"Sort By",
["hot", "new", "top", "rising"],
help="How to sort posts"
)
limit = st.slider(
"Posts per Subreddit",
min_value=50,
max_value=500,
value=200,
step=50,
help="Number of posts to fetch (optimized for 200+)"
)
if sort_by == "top":
time_filter = st.selectbox(
"Time Filter",
["hour", "day", "week", "month", "year", "all"],
index=3
)
else:
time_filter = "month"
batch_size = st.number_input(
"Batch Size",
min_value=10,
max_value=50,
value=25,
help="Posts processed per batch"
)
cache_ttl = st.number_input(
"Cache Duration (minutes)",
min_value=5,
max_value=120,
value=60,
help="How long to cache results"
)
# Main content area with tabs
if st.session_state.scraper:
# Create main tabs
main_tab1, main_tab2 = st.tabs(["📥 Live Collection", "📊 Analytics & Metrics"])
with main_tab1:
st.header("Live Data Collection")
col1, col2, col3 = st.columns(3)
with col1:
if st.button("🚀 Start Collection", type="primary", width="stretch"):
# Initialize/clear stream posts
st.session_state.stream_posts = []
# Create display containers
status_text = st.empty()
progress_bar = st.progress(0)
st.session_state.progress_bar = progress_bar
# Fixed position containers for stats and stream
stats_placeholder = st.empty()
stream_placeholder = st.empty()
status_text.info("🚀 Starting collection...")
try:
if collection_mode == "Advanced with Hierarchy":
# Advanced scraping with comment hierarchies
status_text.info(f"Advanced scraping r/{subreddits[0]}...")
checkpoint = checkpoint_name if use_checkpoint else None
results = st.session_state.advanced_scraper.scrape_with_hierarchy(
subreddits[0], limit=limit, checkpoint_name=checkpoint
)
st.session_state.comment_hierarchies = results['hierarchies']
# Convert to DataFrame
df = pd.DataFrame(results['submissions'])
if df.empty:
df = pd.DataFrame()
else:
df['created_utc'] = pd.to_datetime(df['created_utc'])
st.session_state.data = df
status_text.success(f"✅ Scraped {len(results['submissions'])} posts with {len(results['comments'])} comments!")
elif len(subreddits) == 1:
# Standard single subreddit with streaming
status_text.info(f"Collecting from r/{subreddits[0]}...")
# Show header for stats
with stats_placeholder:
st.subheader("📊 Live Collection Progress")
# Pass the placeholders to the scraper
df = st.session_state.scraper.fetch_subreddit_data_verbose(
subreddits[0], sort_by, limit, time_filter,
(stats_placeholder, stream_placeholder)
)
st.session_state.data = df if not df.empty else pd.DataFrame()
if len(df) > 0:
status_text.success(f"✅ Collected {len(df)} posts!")
else:
status_text.warning("⚠️ No posts collected")
else:
# Multiple subreddits with streaming
status_text.info(f"Collecting from {len(subreddits)} subreddits...")
with stats_placeholder:
st.subheader("📊 Live Collection Progress")
all_data = []
for idx, sub in enumerate(subreddits):
status_text.info(f"Collecting {idx+1}/{len(subreddits)}: r/{sub}...")
df = st.session_state.scraper.fetch_subreddit_data_verbose(
sub, sort_by, limit, time_filter,
(stats_placeholder, stream_placeholder)
)
all_data.append(df)
if all_data:
df = pd.concat(all_data, ignore_index=True)
else:
df = pd.DataFrame()
st.session_state.data = df
status_text.success(f"✅ Collected {len(df)} total posts!")
st.session_state.last_fetch = datetime.now()
except Exception as e:
error_msg = str(e)
# Don't show PRAW iterator exhaustion errors
if "Ran out of input" not in error_msg and "prawcore" not in error_msg.lower():
status_text.error(f"❌ Error: {error_msg}")
elif st.session_state.data.empty:
status_text.warning("⚠️ No posts could be fetched. Try adjusting your filters.")
with col2:
if not st.session_state.data.empty:
st.download_button(
"πŸ“₯ Download CSV",
st.session_state.data.to_csv(index=False),
file_name=f"reddit_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
mime="text/csv",
width="stretch"
)
with col3:
if st.session_state.last_fetch:
st.info(f"Last: {st.session_state.last_fetch.strftime('%H:%M:%S')}")
# Analytics & Metrics Tab
with main_tab2:
if not st.session_state.data.empty:
df = st.session_state.data
# Summary metrics at top
st.header("📈 Summary Metrics")
col1, col2, col3, col4, col5 = st.columns(5)
with col1:
st.metric("Total Posts", f"{len(df):,}")
with col2:
st.metric("Unique Authors", f"{df['author'].nunique():,}")
with col3:
st.metric("Avg Score", f"{df['score'].mean():.1f}")
with col4:
st.metric("Avg Comments", f"{df['num_comments'].mean():.1f}")
with col5:
st.metric("Subreddits", len(df['subreddit'].unique()))
# Tabbed interface for different analyses
tab1, tab2, tab3, tab4, tab5 = st.tabs([
"πŸ“Š Temporal Analytics",
"πŸ’¬ Engagement Analysis",
"πŸ“‹ Raw Data",
"πŸ” Search & Filter",
"🌳 Comment Hierarchies"
])
with tab1:
st.header("Temporal Analytics")
# Generate temporal visualizations
temporal_figs = create_temporal_visualizations(df)
# Activity heatmap
st.plotly_chart(temporal_figs.get('heatmap'), use_container_width=True)
# Time series
st.plotly_chart(temporal_figs.get('timeline'), use_container_width=True)
# Monthly trends
col1, col2 = st.columns(2)
with col1:
st.plotly_chart(temporal_figs.get('monthly'), use_container_width=True)
with col2:
if 'flair' in temporal_figs:
st.plotly_chart(temporal_figs.get('flair'), use_container_width=True)
with tab2:
st.header("Engagement Analysis")
engagement_figs = create_engagement_analytics(df)
col1, col2 = st.columns(2)
with col1:
st.plotly_chart(engagement_figs['score_dist'], use_container_width=True)
with col2:
st.plotly_chart(engagement_figs['correlation'], use_container_width=True)
st.plotly_chart(engagement_figs['top_posts'], use_container_width=True)
with tab3:
st.header("Raw Data View")
# Data filtering options
col1, col2, col3 = st.columns(3)
with col1:
min_score = st.number_input("Min Score", value=0)
with col2:
min_comments = st.number_input("Min Comments", value=0)
with col3:
author_filter = st.text_input("Author Filter")
# Apply filters
filtered_df = df[
(df['score'] >= min_score) &
(df['num_comments'] >= min_comments)
]
if author_filter:
filtered_df = filtered_df[
filtered_df['author'].str.contains(author_filter, case=False, na=False)
]
st.dataframe(
filtered_df[['title', 'author', 'score', 'num_comments',
'created_utc', 'subreddit', 'flair']],
width="stretch",
height=500
)
st.info(f"Showing {len(filtered_df)} of {len(df)} posts")
with tab4:
st.header("Search & Filter")
search_query = st.text_input("Search in titles and text", placeholder="Enter keywords...")
if search_query:
mask = (
df['title'].str.contains(search_query, case=False, na=False) |
df['selftext'].str.contains(search_query, case=False, na=False)
)
search_results = df[mask]
st.info(f"Found {len(search_results)} posts matching '{search_query}'")
if not search_results.empty:
for idx, row in search_results.head(10).iterrows():
with st.expander(f"📝 {row['title'][:100]}..."):
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Score", row['score'])
with col2:
st.metric("Comments", row['num_comments'])
with col3:
st.metric("Date", row['created_utc'].strftime('%Y-%m-%d'))
st.write(f"**Author:** u/{row['author']}")
st.write(f"**Subreddit:** r/{row['subreddit']}")
if row['selftext']:
st.write(f"**Text:** {row['selftext'][:500]}...")
st.write(f"[View on Reddit]({row['permalink']})")
with tab5:
st.header("Comment Hierarchies")
if not ADVANCED_FEATURES:
st.info("⚠️ Comment hierarchy analysis requires additional dependencies. This feature is optional and not needed for basic data collection.")
elif st.session_state.comment_hierarchies:
# Select submission to view
submission_ids = list(st.session_state.comment_hierarchies.keys())
selected_sub = st.selectbox("Select Submission", submission_ids)
if selected_sub:
hierarchy = st.session_state.comment_hierarchies[selected_sub]
# Display submission info
if hierarchy['submission']:
st.subheader(f"📝 {hierarchy['submission'].get('title', 'No Title')}")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Score", hierarchy['submission'].get('score', 0))
with col2:
st.metric("Comments", len(hierarchy.get('comments', {})))
with col3:
st.metric("Author", hierarchy['submission'].get('author', '[deleted]'))
# Visualize comment tree
def display_comment_tree(comments, level=0):
    # Streamlit does not allow an expander inside another expander, so only
    # top-level comments get an expander; nested replies are rendered inline.
    for comment_id, comment in comments.items():
        if level == 0:
            with st.expander(f"💬 {comment.get('author', '[deleted]')} - Score: {comment.get('score', 0)}"):
                st.write(comment.get('body', '')[:500])
                if comment.get('replies'):
                    st.write("**Replies:**")
                    display_comment_tree(comment['replies'], level + 1)
        else:
            st.markdown(f"{'&nbsp;' * 4 * level}↳ **{comment.get('author', '[deleted]')}** (score {comment.get('score', 0)}): {comment.get('body', '')[:300]}")
            if comment.get('replies'):
                display_comment_tree(comment['replies'], level + 1)
st.subheader("Comment Thread Structure")
if hierarchy.get('hierarchy'):
display_comment_tree(hierarchy['hierarchy'])
else:
st.info("No comments found for this submission")
# Orphan statistics
if st.session_state.get('advanced_scraper'):
orphan_stats = st.session_state.advanced_scraper.hierarchy_tracker.get_orphan_statistics()
if orphan_stats['orphaned_count'] > 0:
st.warning(f"⚠️ {orphan_stats['orphaned_count']} orphaned comments detected ({orphan_stats['orphan_rate']:.1%} orphan rate)")
else:
st.info("Use 'Advanced with Hierarchy' collection mode to analyze comment structures")
else:
# Empty state - no data collected yet
st.info("👆 Configure your settings in the sidebar and click 'Start Collection' to begin")
# Quick start guide
with st.expander("🚀 Quick Start Guide"):
st.markdown("""
### Getting Started
1. **Set up API Credentials**
- Get your Reddit API credentials from [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps)
- Enter them in the sidebar
- Click "Initialize Scraper"
2. **Choose Collection Mode**
- **Single Subreddit**: Analyze one community in depth
- **Multiple Subreddits**: Collect from multiple communities
- **Advanced with Hierarchy**: Also capture full comment trees (shown only when the optional advanced scraper module is installed)
3. **Configure Settings**
- Adjust the number of posts (200+ recommended)
- Choose sort method (hot, new, top, rising)
- Set time filter for top posts
4. **Fetch & Analyze**
- Click "Fetch Data" to start collection
- Explore temporal patterns, engagement metrics
- Export results as CSV for further analysis
### Features
- **Batch Processing**: Efficiently handles 200+ posts
- **Caching**: Reduces API calls with smart caching
- **Temporal Analytics**: Hour/day/month patterns
- **Engagement Metrics**: Score, comments, correlations
""")
else:
st.warning("⚠️ Please initialize the scraper with your Reddit API credentials in the sidebar")
# API setup instructions
with st.expander("📖 How to get Reddit API credentials"):
st.markdown("""
### Setting up Reddit API Access
1. **Create a Reddit Account** (if you don't have one)
- Go to [reddit.com](https://www.reddit.com) and sign up
2. **Create an App**
- Visit [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps)
- Click "Create App" or "Create Another App"
- Fill in the form:
- **Name**: Your app name (e.g., "Research Dashboard")
- **App Type**: Select "script"
- **Description**: Optional
- **About URL**: Leave blank
- **Redirect URI**: http://localhost:8000
- Click "Create app"
3. **Get Your Credentials**
- **Client ID**: The string under "personal use script"
- **Client Secret**: The secret key shown
- **User Agent**: Format: "Platform:AppName:Version (by /u/YourUsername)"
4. **Enter in Sidebar**
- Copy your credentials to the sidebar fields
- Click "Initialize Scraper"
""")
if __name__ == "__main__":
main()