# reddit-dashboard/src/streamlit_app.py
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import praw
import time
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict, Any, Optional, Tuple
import concurrent.futures
from functools import lru_cache
import hashlib
import pytz
import sqlite3
import networkx as nx
from pathlib import Path
# Advanced features optional - will gracefully degrade if not available
try:
from advanced_reddit_scraper import (
AdvancedRedditScraper,
ExponentialBackoff,
CommentHierarchyTracker,
CheckpointManager
)
ADVANCED_FEATURES = True
except ImportError:
ADVANCED_FEATURES = False
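# Without the optional advanced_reddit_scraper module, the "Advanced with
# Hierarchy" collection mode is hidden and the Comment Hierarchies tab shows an
# informational message instead (see main() below).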
def load_env_file(env_path: str = ".env") -> Dict[str, str]:
"""
Load environment variables from .env file
Args:
env_path: Path to .env file
Returns:
Dictionary of environment variables
"""
env_vars = {}
env_file = Path(env_path)
if env_file.exists():
with open(env_file, 'r') as f:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
key, value = line.split('=', 1)
key = key.strip()
value = value.strip().strip('"').strip("'")
env_vars[key] = value
return env_vars
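# Example .env contents this loader expects (placeholder values, not real credentials):
#   REDDIT_CLIENT_ID=your_client_id
#   REDDIT_CLIENT_SECRET=your_client_secret
#   REDDIT_USER_AGENT=RedditResearch/1.0 (by /u/YourUsername)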
st.set_page_config(
page_title="Reddit Research Dashboard",
page_icon="📊",
layout="wide",
initial_sidebar_state="expanded"
)
st.markdown("""
<style>
/* Full width container */
.main .block-container {
max-width: 100%;
padding-left: 2rem;
padding-right: 2rem;
}
.main-header {
font-size: 2.5rem;
font-weight: bold;
margin-bottom: 1rem;
background: linear-gradient(90deg, #FF4500 0%, #FFA500 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.metric-card {
background-color: #f0f2f6;
padding: 1rem;
border-radius: 0.5rem;
border-left: 3px solid #FF4500;
}
.stProgress > div > div > div > div {
background-color: #FF4500;
}
.info-box {
background-color: #e8f4f8;
padding: 1rem;
border-radius: 0.5rem;
margin: 1rem 0;
border-left: 3px solid #1f77b4;
}
.stream-output {
background-color: #f8f9fa;
padding: 1rem;
border-radius: 0.5rem;
font-family: 'SF Mono', Monaco, monospace;
font-size: 0.85rem;
max-height: 600px;
overflow-y: auto;
margin: 1rem 0;
border: 1px solid #dee2e6;
}
.stream-item {
padding: 0.75rem;
margin: 0.5rem 0;
background: white;
border-radius: 0.25rem;
border-left: 3px solid #FF4500;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
.stream-title {
font-weight: bold;
color: #1a1a1b;
margin-bottom: 0.25rem;
}
.stream-meta {
color: #787c7e;
font-size: 0.8rem;
}
.stream-stats {
background-color: #e8f4f8;
padding: 0.5rem;
border-radius: 0.25rem;
margin: 0.5rem 0;
font-size: 0.9rem;
}
</style>
""", unsafe_allow_html=True)
class OptimizedRedditScraper:
"""
Optimized Reddit scraper with batch processing, caching, and temporal analytics
"""
def __init__(self, client_id: str, client_secret: str, user_agent: str):
"""Initialize with Reddit API credentials"""
self.reddit = praw.Reddit(
client_id=client_id,
client_secret=client_secret,
user_agent=user_agent,
check_for_async=False
)
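# check_for_async=False disables PRAW's asynchronous-environment check, which
# can otherwise log a warning when a running event loop is detected (as can
# happen under Streamlit). The attributes below add a simple client-side
# throttle: at most one submission request every 0.5 s (min_delay), on top of
# whatever rate limiting PRAW applies itself.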
self.last_request_time = 0
self.min_delay = 0.5
def fetch_subreddit_data_verbose(self, subreddit_name: str, sort_by: str = "hot",
limit: int = 200, time_filter: str = "month",
log_container=None) -> pd.DataFrame:
"""
Fetch Reddit data with verbose logging
Args:
subreddit_name: Name of subreddit to scrape
sort_by: Sort method (hot, new, top, rising)
limit: Number of posts to fetch (optimized for 200+ items)
time_filter: Time filter for top posts
log_container: Streamlit container for logging output
Returns:
DataFrame with Reddit posts data
"""
def stream_post(post_data, stream_container):
"""Display a post as it's collected"""
if stream_container:
timestamp = datetime.now().strftime("%H:%M:%S")
with stream_container.container():
with st.expander(f"📝 {post_data['title'][:80]}...", expanded=False):
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Score", post_data['score'])
with col2:
st.metric("Comments", post_data['num_comments'])
with col3:
st.text(f"u/{post_data['author']}")
with col4:
st.text(timestamp)
def update_stats(stats_container, total, authors, comments):
"""Update collection statistics"""
if stats_container:
stats_container.empty()
with stats_container:
col1, col2, col3 = st.columns(3)
with col1:
st.metric("📊 Posts", total)
with col2:
st.metric("👥 Authors", authors)
with col3:
st.metric("💬 Comments", f"{comments:,}")
# Initialize streaming containers
stats_container = None
stream_container = None
if log_container:
# Check if log_container is a tuple of (stats, stream)
if isinstance(log_container, tuple):
stats_container, stream_container = log_container
else:
stats_container = log_container
stream_container = log_container
data = []
try:
subreddit = self.reddit.subreddit(subreddit_name)
# Choose appropriate method based on sort_by
if sort_by == "top":
submissions = subreddit.top(limit=limit, time_filter=time_filter)
elif sort_by == "new":
submissions = subreddit.new(limit=limit)
elif sort_by == "rising":
submissions = subreddit.rising(limit=limit)
else:
submissions = subreddit.hot(limit=limit)
# Batch processing with rate limiting
batch_size = 25
batch = []
batch_num = 1
post_count = 0
total_comments = 0
try:
# Convert to list to handle iterator exhaustion gracefully
submissions_list = []
try:
for submission in submissions:
try:
# Force PRAW to load the submission by accessing an attribute
_ = submission.id
submissions_list.append(submission)
if len(submissions_list) >= limit:
break
except Exception as sub_error:
# Skip submissions that fail to load
continue
except StopIteration:
pass # Iterator exhausted naturally
except Exception as fetch_error:
error_msg = str(fetch_error)
if "Ran out of input" in error_msg or "prawcore" in error_msg.lower():
# PRAW iterator exhausted - not an error, just end of data
pass
else:
if log_container:
st.warning(f"⚠️ Stopped early: {error_msg}")
if not submissions_list:
if log_container:
st.error(f"No data could be fetched from r/{subreddit_name}")
raise RuntimeError(f"No submissions could be fetched from r/{subreddit_name}")
for i, submission in enumerate(submissions_list):
try:
# Rate limiting before fetching submission data
current_time = time.time()
if current_time - self.last_request_time < self.min_delay:
time.sleep(self.min_delay - (current_time - self.last_request_time))
self.last_request_time = time.time()
batch.append(submission)
post_count += 1
if len(batch) >= batch_size or post_count >= limit:
# Process batch
for idx, sub in enumerate(batch):
try:
# Safely extract all attributes with error handling
try:
post_id = sub.id
post_title = sub.title
post_author = str(sub.author) if sub.author else '[deleted]'
post_created = datetime.fromtimestamp(sub.created_utc, tz=pytz.UTC)
post_score = sub.score
post_comments = sub.num_comments
post_ratio = sub.upvote_ratio
post_text = sub.selftext[:500] if sub.selftext else ''
post_url = sub.url
post_flair = sub.link_flair_text or 'No Flair'
post_video = sub.is_video
post_self = sub.is_self
post_permalink = f"https://reddit.com{sub.permalink}"
except AttributeError as attr_error:
# Missing attribute - skip this post
continue
except Exception as access_error:
# Any other error accessing attributes - skip
continue
post_data = {
'id': post_id,
'title': post_title,
'author': post_author,
'created_utc': post_created,
'score': post_score,
'num_comments': post_comments,
'upvote_ratio': post_ratio,
'selftext': post_text,
'url': post_url,
'subreddit': subreddit_name,
'flair': post_flair,
'is_video': post_video,
'is_self': post_self,
'permalink': post_permalink
}
data.append(post_data)
total_comments += post_data['num_comments']
# Stream the post to UI
stream_post(post_data, stream_container)
except Exception as post_error:
# Skip posts that cause any error
continue
# Update stats
if log_container:
unique_authors = len(set(d['author'] for d in data))
update_stats(stats_container, len(data), unique_authors, total_comments)
batch = []
batch_num += 1
# Update progress
if st.session_state.get('progress_bar'):
progress = min(post_count / limit, 1.0)
st.session_state.progress_bar.progress(progress)
# Stop if we've reached the limit
if post_count >= limit:
break
except StopIteration:
break
except Exception as iter_error:
continue
# Process any remaining items in batch
if batch:
for idx, sub in enumerate(batch):
try:
# Safely extract all attributes
try:
post_id = sub.id
post_title = sub.title
post_author = str(sub.author) if sub.author else '[deleted]'
post_created = datetime.fromtimestamp(sub.created_utc, tz=pytz.UTC)
post_score = sub.score
post_comments = sub.num_comments
post_ratio = sub.upvote_ratio
post_text = sub.selftext[:500] if sub.selftext else ''
post_url = sub.url
post_flair = sub.link_flair_text or 'No Flair'
post_video = sub.is_video
post_self = sub.is_self
post_permalink = f"https://reddit.com{sub.permalink}"
except Exception:
# Skip posts that fail attribute access
continue
post_data = {
'id': post_id,
'title': post_title,
'author': post_author,
'created_utc': post_created,
'score': post_score,
'num_comments': post_comments,
'upvote_ratio': post_ratio,
'selftext': post_text,
'url': post_url,
'subreddit': subreddit_name,
'flair': post_flair,
'is_video': post_video,
'is_self': post_self,
'permalink': post_permalink
}
data.append(post_data)
total_comments += post_data['num_comments']
stream_post(post_data, stream_container)
except Exception:
# Skip any problematic posts
continue
except StopIteration:
pass
# Final stats update
if log_container:
unique_authors = len(set(d['author'] for d in data))
update_stats(stats_container, len(data), unique_authors, total_comments)
except Exception as e:
error_msg = str(e)
# Don't show scary errors for common PRAW issues
if "Ran out of input" in error_msg or "prawcore" in error_msg.lower():
if log_container and len(data) == 0:
st.warning("⚠️ No posts could be fetched. The subreddit may be empty or private.")
else:
if log_container:
st.error(f"❌ Error: {error_msg}")
if len(data) == 0: # Only raise if we got no data at all
raise
# Return whatever data we managed to collect
if len(data) == 0 and log_container:
st.info("ℹ️ No posts were collected. Try adjusting your filters or selecting a different subreddit.")
return pd.DataFrame(data)
def fetch_subreddit_data(self, subreddit_name: str, sort_by: str = "hot",
limit: int = 200, time_filter: str = "month") -> pd.DataFrame:
"""
Fetch data with manual session-based caching
"""
# Create cache key
cache_key = f"{subreddit_name}_{sort_by}_{limit}_{time_filter}"
# Check if data exists in session state cache
if 'data_cache' not in st.session_state:
st.session_state.data_cache = {}
if cache_key in st.session_state.data_cache:
cache_entry = st.session_state.data_cache[cache_key]
# Check if cache is still valid (1 hour TTL)
if (datetime.now() - cache_entry['timestamp']).total_seconds() < 3600:
return cache_entry['data']
# Fetch new data
df = self.fetch_subreddit_data_verbose(subreddit_name, sort_by, limit, time_filter, None)
# Store in cache
st.session_state.data_cache[cache_key] = {
'data': df,
'timestamp': datetime.now()
}
return df
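# Illustrative usage (credentials are placeholders):
#   scraper = OptimizedRedditScraper("client_id", "client_secret", "RedditResearch/1.0")
#   df = scraper.fetch_subreddit_data("CUNY", sort_by="top", limit=200, time_filter="month")
# Repeating the same call within the 1-hour TTL returns the cached DataFrame
# instead of hitting the Reddit API again.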
def fetch_multiple_subreddits(self, subreddits: List[str], limit_per: int = 100,
sort_by: str = "hot") -> pd.DataFrame:
"""
Fetch data from multiple subreddits with manual caching
Args:
subreddits: List of subreddit names
limit_per: Posts per subreddit
sort_by: Sort method
Returns:
Combined DataFrame
"""
# Create cache key
cache_key = f"multi_{'_'.join(sorted(subreddits))}_{sort_by}_{limit_per}"
# Check cache
if 'data_cache' not in st.session_state:
st.session_state.data_cache = {}
if cache_key in st.session_state.data_cache:
cache_entry = st.session_state.data_cache[cache_key]
# Check if cache is still valid (30 min TTL)
if (datetime.now() - cache_entry['timestamp']).total_seconds() < 1800:
return cache_entry['data']
# Fetch new data
all_data = []
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
future_to_sub = {
executor.submit(self.fetch_subreddit_data, sub, sort_by, limit_per): sub
for sub in subreddits
}
for future in concurrent.futures.as_completed(future_to_sub):
sub = future_to_sub[future]
try:
data = future.result()
all_data.append(data)
except Exception as e:
st.error(f"Error fetching r/{sub}: {e}")
if all_data:
df = pd.concat(all_data, ignore_index=True)
else:
df = pd.DataFrame()
# Store in cache
st.session_state.data_cache[cache_key] = {
'data': df,
'timestamp': datetime.now()
}
return df
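# Illustrative usage with the sidebar's default subreddit list:
#   df = scraper.fetch_multiple_subreddits(["CUNY", "Baruch", "HunterCollege"], limit_per=100)
# Each subreddit is fetched on its own worker thread (max_workers=3) and the
# results are concatenated into one DataFrame, cached for 30 minutes.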
def create_temporal_visualizations(df: pd.DataFrame) -> Dict[str, go.Figure]:
"""
Create comprehensive temporal analytics visualizations
Args:
df: DataFrame with Reddit data
Returns:
Dictionary of Plotly figures
"""
figures = {}
# Ensure datetime column
if 'created_utc' in df.columns:
df['created_utc'] = pd.to_datetime(df['created_utc'])
df = df.sort_values('created_utc')
# Get actual date range of collected data with padding
date_min = df['created_utc'].min()
date_max = df['created_utc'].max()
date_range = (date_max - date_min).days
# Add 2% padding to prevent edge clipping
padding = pd.Timedelta(days=max(1, int(date_range * 0.02)))
date_min_padded = date_min - padding
date_max_padded = date_max + padding
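# e.g. a 100-day span gets max(1, int(100 * 0.02)) = 2 days of padding per side;
# spans shorter than 50 days fall back to the 1-day minimum.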
# 1. Hourly activity heatmap
df['hour'] = df['created_utc'].dt.hour
df['day_of_week'] = df['created_utc'].dt.day_name()
heatmap_data = df.groupby(['day_of_week', 'hour']).size().reset_index(name='count')
pivot_data = heatmap_data.pivot(index='day_of_week', columns='hour', values='count').fillna(0)
# Reorder days
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot_data = pivot_data.reindex(days_order)
fig_heatmap = go.Figure(data=go.Heatmap(
z=pivot_data.values,
x=pivot_data.columns,
y=pivot_data.index,
colorscale='RdYlBu_r',
text=pivot_data.values.astype(int),
texttemplate='%{text}',
textfont={"size": 8},
hovertemplate='%{y}<br>%{x}:00<br>Posts: %{z}<extra></extra>'
))
fig_heatmap.update_layout(
title='Activity Heatmap by Hour and Day',
xaxis_title='Hour of Day',
yaxis_title='Day of Week',
height=400
)
figures['heatmap'] = fig_heatmap
# 2. Time series with rolling average - only include days with actual data
daily_stats = df.set_index('created_utc').resample('D').agg({
'id': 'count',
'score': 'mean',
'num_comments': 'mean'
}).rename(columns={'id': 'post_count'})
# Filter out days with no posts to prevent misleading gaps
daily_stats = daily_stats[daily_stats['post_count'] > 0]
# Calculate rolling averages
daily_stats['post_count_ma7'] = daily_stats['post_count'].rolling(window=7, min_periods=1).mean()
daily_stats['score_ma7'] = daily_stats['score'].rolling(window=7, min_periods=1).mean()
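# min_periods=1 keeps the earliest points defined (averaging however many days
# exist so far) rather than returning NaN until a full 7-day window accumulates.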
fig_timeline = make_subplots(
rows=2, cols=1,
subplot_titles=('Daily Post Activity', 'Average Engagement Metrics'),
vertical_spacing=0.1
)
# Post count
fig_timeline.add_trace(
go.Scatter(x=daily_stats.index, y=daily_stats['post_count'],
mode='markers', name='Daily Posts', opacity=0.5,
marker=dict(size=5, color='lightblue')),
row=1, col=1
)
fig_timeline.add_trace(
go.Scatter(x=daily_stats.index, y=daily_stats['post_count_ma7'],
mode='lines', name='7-Day Average',
line=dict(color='blue', width=2),
connectgaps=False),
row=1, col=1
)
# Engagement metrics
fig_timeline.add_trace(
go.Scatter(x=daily_stats.index, y=daily_stats['score_ma7'],
mode='lines', name='Avg Score (7-day)',
line=dict(color='orange'),
connectgaps=False),
row=2, col=1
)
fig_timeline.add_trace(
go.Scatter(x=daily_stats.index, y=daily_stats['num_comments'].rolling(window=7, min_periods=1).mean(),
mode='lines', name='Avg Comments (7-day)',
line=dict(color='green'),
connectgaps=False),
row=2, col=1
)
fig_timeline.update_layout(
height=600,
showlegend=True,
title=f'Activity Timeline ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
xaxis=dict(type='date', autorange=True),
xaxis2=dict(type='date', autorange=True)
)
fig_timeline.update_xaxes(title_text="Date", row=2, col=1)
fig_timeline.update_yaxes(title_text="Count", row=1, col=1)
fig_timeline.update_yaxes(title_text="Value", row=2, col=1)
figures['timeline'] = fig_timeline
# 3. Monthly trend analysis (only show if data spans at least 30 days)
if date_range >= 30:
monthly_data = df.set_index('created_utc').resample('M').agg({
'id': 'count',
'score': ['mean', 'sum'],
'num_comments': ['mean', 'sum']
})
# Filter out months with no posts
monthly_data = monthly_data[monthly_data[('id', 'count')] > 0]
fig_monthly = go.Figure()
fig_monthly.add_trace(go.Bar(
x=monthly_data.index,
y=monthly_data[('id', 'count')],
name='Monthly Posts',
marker_color='lightblue'
))
fig_monthly.add_trace(go.Scatter(
x=monthly_data.index,
y=monthly_data[('score', 'mean')],
name='Avg Score',
yaxis='y2',
line=dict(color='red', width=2),
connectgaps=False
))
fig_monthly.update_layout(
title=f'Monthly Posting Trends ({date_min.strftime("%Y-%m")} to {date_max.strftime("%Y-%m")})',
xaxis_title='Month',
xaxis=dict(type='date', autorange=True),
yaxis=dict(title='Post Count', side='left'),
yaxis2=dict(title='Average Score', side='right', overlaying='y'),
height=400,
hovermode='x unified'
)
figures['monthly'] = fig_monthly
else:
# For shorter periods, show weekly trends instead
weekly_data = df.set_index('created_utc').resample('W').agg({
'id': 'count',
'score': ['mean', 'sum'],
'num_comments': ['mean', 'sum']
})
# Filter out weeks with no posts
weekly_data = weekly_data[weekly_data[('id', 'count')] > 0]
fig_weekly = go.Figure()
fig_weekly.add_trace(go.Bar(
x=weekly_data.index,
y=weekly_data[('id', 'count')],
name='Weekly Posts',
marker_color='lightblue'
))
fig_weekly.add_trace(go.Scatter(
x=weekly_data.index,
y=weekly_data[('score', 'mean')],
name='Avg Score',
yaxis='y2',
line=dict(color='red', width=2),
connectgaps=False
))
fig_weekly.update_layout(
title=f'Weekly Posting Trends ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
xaxis_title='Week',
xaxis=dict(type='date', autorange=True),
yaxis=dict(title='Post Count', side='left'),
yaxis2=dict(title='Average Score', side='right', overlaying='y'),
height=400,
hovermode='x unified'
)
figures['monthly'] = fig_weekly # Use same key for consistency
# 4. Posting patterns by flair
if 'flair' in df.columns:
flair_time = df.groupby([pd.Grouper(key='created_utc', freq='W'), 'flair']).size().reset_index(name='count')
top_flairs = df['flair'].value_counts().head(10).index
flair_time_filtered = flair_time[flair_time['flair'].isin(top_flairs)]
fig_flair = px.line(flair_time_filtered, x='created_utc', y='count',
color='flair',
title=f'Weekly Posting Patterns by Flair ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
labels={'count': 'Number of Posts', 'created_utc': 'Week'})
fig_flair.update_layout(
height=400,
xaxis=dict(type='date', autorange=True)
)
figures['flair'] = fig_flair
return figures
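# Keys returned: 'heatmap', 'timeline', 'monthly' (weekly figures reuse this key
# when the data spans fewer than 30 days), and 'flair' when a flair column is
# present. main() renders them in the "Temporal Analytics" tab.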
def create_engagement_analytics(df: pd.DataFrame) -> Dict[str, go.Figure]:
"""
Create engagement and interaction analytics
Args:
df: DataFrame with Reddit data
Returns:
Dictionary of engagement figures
"""
figures = {}
# 1. Score distribution
fig_score_dist = go.Figure()
fig_score_dist.add_trace(go.Histogram(
x=df['score'],
nbinsx=50,
name='Score Distribution',
marker_color='orange'
))
fig_score_dist.update_layout(
title='Post Score Distribution',
xaxis_title='Score',
yaxis_title='Frequency',
height=350
)
figures['score_dist'] = fig_score_dist
# 2. Engagement correlation
fig_correlation = px.scatter(
df, x='score', y='num_comments',
size='upvote_ratio', color='is_self',
title='Score vs Comments Correlation',
labels={'score': 'Post Score', 'num_comments': 'Number of Comments',
'is_self': 'Post Type', 'upvote_ratio': 'Upvote Ratio'},
hover_data=['title']
)
fig_correlation.update_layout(height=400)
figures['correlation'] = fig_correlation
# 3. Top performing posts
top_posts = df.nlargest(10, 'score')[['title', 'score', 'num_comments', 'author']]
fig_top = go.Figure(data=[
go.Bar(name='Score', x=top_posts['title'].str[:30] + '...', y=top_posts['score']),
go.Bar(name='Comments', x=top_posts['title'].str[:30] + '...', y=top_posts['num_comments'])
])
fig_top.update_layout(
title='Top 10 Posts by Engagement',
barmode='group',
height=400,
xaxis_tickangle=-45
)
figures['top_posts'] = fig_top
return figures
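# Keys returned: 'score_dist', 'correlation', and 'top_posts', rendered in the
# "Engagement Analysis" tab of main().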
def main():
"""Main application function"""
# Initialize session state
if 'scraper' not in st.session_state:
st.session_state.scraper = None
if 'advanced_scraper' not in st.session_state:
st.session_state.advanced_scraper = None
if 'data' not in st.session_state:
st.session_state.data = pd.DataFrame()
if 'last_fetch' not in st.session_state:
st.session_state.last_fetch = None
if 'comment_hierarchies' not in st.session_state:
st.session_state.comment_hierarchies = {}
# Header
st.markdown('<h1 class="main-header">📊 Reddit Research Dashboard</h1>', unsafe_allow_html=True)
st.markdown("Optimized for high-volume data collection and temporal analytics")
# Load .env file if it exists
env_vars = load_env_file(".env")
if not env_vars:
# Try parent directory
env_vars = load_env_file("../.env")
# Sidebar configuration
with st.sidebar:
st.header("⚙️ Configuration")
# Show .env detection status
if env_vars:
st.success("✅ .env file detected and loaded")
# API Credentials - prioritize .env, then environment variables
with st.expander("🔑 Reddit API Credentials", expanded=not bool(env_vars)):
default_client_id = env_vars.get("REDDIT_CLIENT_ID", os.environ.get("REDDIT_CLIENT_ID", ""))
default_client_secret = env_vars.get("REDDIT_CLIENT_SECRET", os.environ.get("REDDIT_CLIENT_SECRET", ""))
default_user_agent = env_vars.get("REDDIT_USER_AGENT", os.environ.get("REDDIT_USER_AGENT", "RedditResearch/1.0"))
client_id = st.text_input(
"Client ID",
value=default_client_id,
type="password",
help="Your Reddit API client ID (auto-populated from .env if available)"
)
client_secret = st.text_input(
"Client Secret",
value=default_client_secret,
type="password",
help="Your Reddit API client secret (auto-populated from .env if available)"
)
user_agent = st.text_input(
"User Agent",
value=default_user_agent,
help="User agent string for API requests (auto-populated from .env if available)"
)
if st.button("Initialize Scraper", type="primary"):
if client_id and client_secret:
try:
st.session_state.scraper = OptimizedRedditScraper(
client_id, client_secret, user_agent
)
if ADVANCED_FEATURES:
st.session_state.advanced_scraper = AdvancedRedditScraper(
client_id, client_secret, user_agent
)
st.success("✅ Scrapers initialized successfully (with advanced features)!")
else:
st.success("✅ Scraper initialized successfully!")
except Exception as e:
st.error(f"❌ Failed to initialize: {e}")
else:
st.warning("⚠️ Please provide API credentials")
# Data Collection Settings
st.header("📥 Data Collection")
# Show advanced mode only if features are available
if ADVANCED_FEATURES:
collection_mode = st.radio(
"Collection Mode",
["Single Subreddit", "Multiple Subreddits", "Advanced with Hierarchy"]
)
else:
collection_mode = st.radio(
"Collection Mode",
["Single Subreddit", "Multiple Subreddits"]
)
if collection_mode == "Single Subreddit":
subreddit_name = st.text_input("Subreddit Name", value="CUNY")
subreddits = [subreddit_name]
elif collection_mode == "Multiple Subreddits":
subreddit_input = st.text_area(
"Subreddits (one per line)",
value="CUNY\nBaruch\nHunterCollege",
height=100
)
subreddits = [s.strip() for s in subreddit_input.split('\n') if s.strip()]
else:
# Advanced with Hierarchy (only if ADVANCED_FEATURES is True)
subreddit_name = st.text_input("Subreddit Name", value="CUNY")
subreddits = [subreddit_name]
use_checkpoint = st.checkbox("Enable checkpoint/resume", value=True)
if use_checkpoint:
checkpoint_name = st.text_input("Checkpoint name", value=f"{subreddit_name}_checkpoint")
# Advanced settings
with st.expander("⚙️ Advanced Settings"):
sort_by = st.selectbox(
"Sort By",
["hot", "new", "top", "rising"],
help="How to sort posts"
)
limit = st.slider(
"Posts per Subreddit",
min_value=50,
max_value=500,
value=200,
step=50,
help="Number of posts to fetch (optimized for 200+)"
)
if sort_by == "top":
time_filter = st.selectbox(
"Time Filter",
["hour", "day", "week", "month", "year", "all"],
index=3
)
else:
time_filter = "month"
batch_size = st.number_input(
"Batch Size",
min_value=10,
max_value=50,
value=25,
help="Posts processed per batch"
)
cache_ttl = st.number_input(
"Cache Duration (minutes)",
min_value=5,
max_value=120,
value=60,
help="How long to cache results"
)
# Main content area with tabs
if st.session_state.scraper:
# Create main tabs
main_tab1, main_tab2 = st.tabs(["📥 Live Collection", "📊 Analytics & Metrics"])
with main_tab1:
st.header("Live Data Collection")
col1, col2, col3 = st.columns(3)
with col1:
if st.button("🚀 Start Collection", type="primary", width="stretch"):
# Initialize/clear stream posts
st.session_state.stream_posts = []
# Create display containers
status_text = st.empty()
progress_bar = st.progress(0)
st.session_state.progress_bar = progress_bar
# Fixed position containers for stats and stream
stats_placeholder = st.empty()
stream_placeholder = st.empty()
status_text.info("🚀 Starting collection...")
try:
if collection_mode == "Advanced with Hierarchy":
# Advanced scraping with comment hierarchies
status_text.info(f"Advanced scraping r/{subreddits[0]}...")
checkpoint = checkpoint_name if use_checkpoint else None
results = st.session_state.advanced_scraper.scrape_with_hierarchy(
subreddits[0], limit=limit, checkpoint_name=checkpoint
)
st.session_state.comment_hierarchies = results['hierarchies']
# Convert to DataFrame
df = pd.DataFrame(results['submissions'])
if df.empty:
df = pd.DataFrame()
else:
df['created_utc'] = pd.to_datetime(df['created_utc'])
st.session_state.data = df
status_text.success(f"✅ Scraped {len(results['submissions'])} posts with {len(results['comments'])} comments!")
elif len(subreddits) == 1:
# Standard single subreddit with streaming
status_text.info(f"Collecting from r/{subreddits[0]}...")
# Show header for stats
with stats_placeholder:
st.subheader("📊 Live Collection Progress")
# Pass the placeholders to the scraper
df = st.session_state.scraper.fetch_subreddit_data_verbose(
subreddits[0], sort_by, limit, time_filter,
(stats_placeholder, stream_placeholder)
)
st.session_state.data = df if not df.empty else pd.DataFrame()
if len(df) > 0:
status_text.success(f"✅ Collected {len(df)} posts!")
else:
status_text.warning("⚠️ No posts collected")
else:
# Multiple subreddits with streaming
status_text.info(f"Collecting from {len(subreddits)} subreddits...")
with stats_placeholder:
st.subheader("📊 Live Collection Progress")
all_data = []
for idx, sub in enumerate(subreddits):
status_text.info(f"Collecting {idx+1}/{len(subreddits)}: r/{sub}...")
df = st.session_state.scraper.fetch_subreddit_data_verbose(
sub, sort_by, limit, time_filter,
(stats_placeholder, stream_placeholder)
)
all_data.append(df)
if all_data:
df = pd.concat(all_data, ignore_index=True)
else:
df = pd.DataFrame()
st.session_state.data = df
status_text.success(f"✅ Collected {len(df)} total posts!")
st.session_state.last_fetch = datetime.now()
except Exception as e:
error_msg = str(e)
# Don't show PRAW iterator exhaustion errors
if "Ran out of input" not in error_msg and "prawcore" not in error_msg.lower():
status_text.error(f"❌ Error: {error_msg}")
elif st.session_state.data.empty:
status_text.warning("⚠️ No posts could be fetched. Try adjusting your filters.")
with col2:
if not st.session_state.data.empty:
st.download_button(
"πŸ“₯ Download CSV",
st.session_state.data.to_csv(index=False),
file_name=f"reddit_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
mime="text/csv",
width="stretch"
)
with col3:
if st.session_state.last_fetch:
st.info(f"Last: {st.session_state.last_fetch.strftime('%H:%M:%S')}")
# Analytics & Metrics Tab
with main_tab2:
if not st.session_state.data.empty:
df = st.session_state.data
# Summary metrics at top
st.header("📈 Summary Metrics")
col1, col2, col3, col4, col5 = st.columns(5)
with col1:
st.metric("Total Posts", f"{len(df):,}")
with col2:
st.metric("Unique Authors", f"{df['author'].nunique():,}")
with col3:
st.metric("Avg Score", f"{df['score'].mean():.1f}")
with col4:
st.metric("Avg Comments", f"{df['num_comments'].mean():.1f}")
with col5:
st.metric("Subreddits", len(df['subreddit'].unique()))
# Tabbed interface for different analyses
tab1, tab2, tab3, tab4, tab5 = st.tabs([
"πŸ“Š Temporal Analytics",
"πŸ’¬ Engagement Analysis",
"πŸ“‹ Raw Data",
"πŸ” Search & Filter",
"🌳 Comment Hierarchies"
])
with tab1:
st.header("Temporal Analytics")
# Generate temporal visualizations
temporal_figs = create_temporal_visualizations(df)
# Activity heatmap
st.plotly_chart(temporal_figs.get('heatmap'), use_container_width=True)
# Time series
st.plotly_chart(temporal_figs.get('timeline'), use_container_width=True)
# Monthly trends
col1, col2 = st.columns(2)
with col1:
st.plotly_chart(temporal_figs.get('monthly'), use_container_width=True)
with col2:
if 'flair' in temporal_figs:
st.plotly_chart(temporal_figs.get('flair'), use_container_width=True)
with tab2:
st.header("Engagement Analysis")
engagement_figs = create_engagement_analytics(df)
col1, col2 = st.columns(2)
with col1:
st.plotly_chart(engagement_figs['score_dist'], use_container_width=True)
with col2:
st.plotly_chart(engagement_figs['correlation'], use_container_width=True)
st.plotly_chart(engagement_figs['top_posts'], use_container_width=True)
with tab3:
st.header("Raw Data View")
# Data filtering options
col1, col2, col3 = st.columns(3)
with col1:
min_score = st.number_input("Min Score", value=0)
with col2:
min_comments = st.number_input("Min Comments", value=0)
with col3:
author_filter = st.text_input("Author Filter")
# Apply filters
filtered_df = df[
(df['score'] >= min_score) &
(df['num_comments'] >= min_comments)
]
if author_filter:
filtered_df = filtered_df[
filtered_df['author'].str.contains(author_filter, case=False, na=False)
]
st.dataframe(
filtered_df[['title', 'author', 'score', 'num_comments',
'created_utc', 'subreddit', 'flair']],
width="stretch",
height=500
)
st.info(f"Showing {len(filtered_df)} of {len(df)} posts")
with tab4:
st.header("Search & Filter")
search_query = st.text_input("Search in titles and text", placeholder="Enter keywords...")
if search_query:
mask = (
df['title'].str.contains(search_query, case=False, na=False) |
df['selftext'].str.contains(search_query, case=False, na=False)
)
search_results = df[mask]
st.info(f"Found {len(search_results)} posts matching '{search_query}'")
if not search_results.empty:
for idx, row in search_results.head(10).iterrows():
with st.expander(f"📝 {row['title'][:100]}..."):
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Score", row['score'])
with col2:
st.metric("Comments", row['num_comments'])
with col3:
st.metric("Date", row['created_utc'].strftime('%Y-%m-%d'))
st.write(f"**Author:** u/{row['author']}")
st.write(f"**Subreddit:** r/{row['subreddit']}")
if row['selftext']:
st.write(f"**Text:** {row['selftext'][:500]}...")
st.write(f"[View on Reddit]({row['permalink']})")
with tab5:
st.header("Comment Hierarchies")
if not ADVANCED_FEATURES:
st.info("⚠️ Comment hierarchy analysis requires additional dependencies. This feature is optional and not needed for basic data collection.")
elif st.session_state.comment_hierarchies:
# Select submission to view
submission_ids = list(st.session_state.comment_hierarchies.keys())
selected_sub = st.selectbox("Select Submission", submission_ids)
if selected_sub:
hierarchy = st.session_state.comment_hierarchies[selected_sub]
# Display submission info
if hierarchy['submission']:
st.subheader(f"📝 {hierarchy['submission'].get('title', 'No Title')}")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Score", hierarchy['submission'].get('score', 0))
with col2:
st.metric("Comments", len(hierarchy.get('comments', {})))
with col3:
st.metric("Author", hierarchy['submission'].get('author', '[deleted]'))
# Visualize comment tree
def display_comment_tree(comments, level=0):
    # Streamlit does not allow an expander inside another expander, so only
    # top-level comments get an expander; nested replies are rendered inline.
    for comment_id, comment in comments.items():
        if level == 0:
            with st.expander(f"💬 {comment.get('author', '[deleted]')} - Score: {comment.get('score', 0)}"):
                st.write(comment.get('body', '')[:500])
                if comment.get('replies'):
                    st.write("**Replies:**")
                    display_comment_tree(comment['replies'], level + 1)
        else:
            st.markdown(f"{'&nbsp;' * 4 * level}↳ **{comment.get('author', '[deleted]')}** (score {comment.get('score', 0)}): {comment.get('body', '')[:300]}")
            if comment.get('replies'):
                display_comment_tree(comment['replies'], level + 1)
st.subheader("Comment Thread Structure")
if hierarchy.get('hierarchy'):
display_comment_tree(hierarchy['hierarchy'])
else:
st.info("No comments found for this submission")
# Orphan statistics
if st.session_state.get('advanced_scraper'):
orphan_stats = st.session_state.advanced_scraper.hierarchy_tracker.get_orphan_statistics()
if orphan_stats['orphaned_count'] > 0:
st.warning(f"⚠️ {orphan_stats['orphaned_count']} orphaned comments detected ({orphan_stats['orphan_rate']:.1%} orphan rate)")
else:
st.info("Use 'Advanced with Hierarchy' collection mode to analyze comment structures")
else:
# Empty state - no data collected yet
st.info("👆 Configure your settings in the sidebar and click 'Start Collection' to begin")
# Quick start guide
with st.expander("🚀 Quick Start Guide"):
st.markdown("""
### Getting Started
1. **Set up API Credentials**
- Get your Reddit API credentials from [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps)
- Enter them in the sidebar
- Click "Initialize Scraper"
2. **Choose Collection Mode**
- **Single Subreddit**: Analyze one community in depth
- **Multiple Subreddits**: Collect from multiple communities
- **Advanced with Hierarchy**: Also capture full comment trees (shown only when the optional advanced scraper module is installed)
3. **Configure Settings**
- Adjust the number of posts (200+ recommended)
- Choose sort method (hot, new, top, rising)
- Set time filter for top posts
4. **Fetch & Analyze**
- Click "Fetch Data" to start collection
- Explore temporal patterns, engagement metrics
- Export results as CSV for further analysis
### Features
- **Batch Processing**: Efficiently handles 200+ posts
- **Caching**: Reduces API calls with smart caching
- **Temporal Analytics**: Hour/day/month patterns
- **Engagement Metrics**: Score, comments, correlations
""")
else:
st.warning("⚠️ Please initialize the scraper with your Reddit API credentials in the sidebar")
# API setup instructions
with st.expander("📖 How to get Reddit API credentials"):
st.markdown("""
### Setting up Reddit API Access
1. **Create a Reddit Account** (if you don't have one)
- Go to [reddit.com](https://www.reddit.com) and sign up
2. **Create an App**
- Visit [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps)
- Click "Create App" or "Create Another App"
- Fill in the form:
- **Name**: Your app name (e.g., "Research Dashboard")
- **App Type**: Select "script"
- **Description**: Optional
- **About URL**: Leave blank
- **Redirect URI**: http://localhost:8000
- Click "Create app"
3. **Get Your Credentials**
- **Client ID**: The string under "personal use script"
- **Client Secret**: The secret key shown
- **User Agent**: Format: "Platform:AppName:Version (by /u/YourUsername)"
4. **Enter in Sidebar**
- Copy your credentials to the sidebar fields
- Click "Initialize Scraper"
""")
if __name__ == "__main__":
main()