Spaces:

milwright
/

reddit-dashboard

Sleeping

File size: 54,596 Bytes

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import praw
import time
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict, Any, Optional, Tuple
import concurrent.futures
from functools import lru_cache
import hashlib
import pytz
import sqlite3
import networkx as nx
from pathlib import Path
# Advanced features optional - will gracefully degrade if not available
try:
    from advanced_reddit_scraper import (
        AdvancedRedditScraper,
        ExponentialBackoff,
        CommentHierarchyTracker,
        CheckpointManager
    )
    ADVANCED_FEATURES = True
except ImportError:
    ADVANCED_FEATURES = False

def load_env_file(env_path: str = ".env") -> Dict[str, str]:
    """
    Load environment variables from .env file

    Parsing rules:
      * blank lines, lines starting with ``#`` and lines without ``=`` are ignored
      * only the first ``=`` splits key from value, so values may contain ``=``
      * an optional leading ``export `` on the key is dropped
      * surrounding whitespace and surrounding single/double quotes are
        stripped from the value
      * lines whose key is empty (e.g. ``=value``) are ignored

    Args:
        env_path: Path to .env file

    Returns:
        Dictionary of environment variables; empty when ``env_path`` does not
        point to a regular file
    """
    env_file = Path(env_path)

    # is_file() instead of exists(): a directory that happens to be called
    # ".env" must be treated as "no file" rather than crash inside open().
    if not env_file.is_file():
        return {}

    env_vars: Dict[str, str] = {}
    export_prefix = 'export '

    # utf-8-sig reads plain UTF-8 and also swallows a leading BOM, which would
    # otherwise become part of the first key.
    with env_file.open('r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue

            key, value = line.split('=', 1)
            key = key.strip()
            if key.startswith(export_prefix):
                key = key[len(export_prefix):].strip()
            if not key:
                continue

            env_vars[key] = value.strip().strip('"').strip("'")

    return env_vars

# Page-level Streamlit settings: browser tab title and icon, wide layout, and
# the sidebar opened on first load.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first Streamlit command executed in a script - confirm against the pinned
# Streamlit version before placing any other st.* call above this one.
st.set_page_config(
    page_title="Reddit Research Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Inject the dashboard's custom CSS once at import time. The stylesheet covers:
#   - the full-width main container
#   - .main-header: gradient page title
#   - .metric-card / .info-box: highlighted panels
#   - the progress-bar fill color
#   - .stream-*: styles for streamed post output
# unsafe_allow_html=True is passed so the <style> element is emitted as HTML.
st.markdown("""
<style>
    /* Full width container */
    .main .block-container {
        max-width: 100%;
        padding-left: 2rem;
        padding-right: 2rem;
    }
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        margin-bottom: 1rem;
        background: linear-gradient(90deg, #FF4500 0%, #FFA500 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        border-left: 3px solid #FF4500;
    }
    .stProgress > div > div > div > div {
        background-color: #FF4500;
    }
    .info-box {
        background-color: #e8f4f8;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
        border-left: 3px solid #1f77b4;
    }
    .stream-output {
        background-color: #f8f9fa;
        padding: 1rem;
        border-radius: 0.5rem;
        font-family: 'SF Mono', Monaco, monospace;
        font-size: 0.85rem;
        max-height: 600px;
        overflow-y: auto;
        margin: 1rem 0;
        border: 1px solid #dee2e6;
    }
    .stream-item {
        padding: 0.75rem;
        margin: 0.5rem 0;
        background: white;
        border-radius: 0.25rem;
        border-left: 3px solid #FF4500;
        box-shadow: 0 1px 3px rgba(0,0,0,0.1);
    }
    .stream-title {
        font-weight: bold;
        color: #1a1a1b;
        margin-bottom: 0.25rem;
    }
    .stream-meta {
        color: #787c7e;
        font-size: 0.8rem;
    }
    .stream-stats {
        background-color: #e8f4f8;
        padding: 0.5rem;
        border-radius: 0.25rem;
        margin: 0.5rem 0;
        font-size: 0.9rem;
    }
</style>
""", unsafe_allow_html=True)

class OptimizedRedditScraper:
    """
    Optimized Reddit scraper with batch processing, caching, and temporal analytics

    Public entry points:
      * ``fetch_subreddit_data_verbose`` - fetch one subreddit, optionally
        streaming posts and running statistics into Streamlit containers
      * ``fetch_subreddit_data`` - same fetch, memoized in
        ``st.session_state.data_cache`` for one hour
      * ``fetch_multiple_subreddits`` - fetch several subreddits on a small
        thread pool and concatenate the results (memoized for 30 minutes)

    Threading note: everything that touches Streamlit (``st.*`` and
    ``st.session_state``) is kept on the calling thread. Worker threads only
    run ``_collect_posts`` with no log container and no progress bar, which
    makes no Streamlit calls.
    """

    # Number of submissions converted between two stats/progress refreshes.
    _BATCH_SIZE = 25
    # Lifetime, in seconds, of a cached single-subreddit result.
    _SINGLE_CACHE_TTL = 3600
    # Lifetime, in seconds, of a cached multi-subreddit result.
    _MULTI_CACHE_TTL = 1800

    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        """Initialize with Reddit API credentials"""
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
            check_for_async=False
        )
        # Timestamp (time.time()) of the last throttled step and the minimum
        # spacing, in seconds, enforced between two such steps.
        self.last_request_time = 0
        self.min_delay = 0.5

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _is_expected_praw_error(error_msg: str) -> bool:
        """Return True for error messages treated as 'end of data' rather than failures."""
        return "Ran out of input" in error_msg or "prawcore" in error_msg.lower()

    @staticmethod
    def _submission_to_record(submission: Any, subreddit_name: str) -> Dict[str, Any]:
        """Convert one submission object into the flat dict used as a DataFrame row.

        Any exception raised while reading an attribute propagates to the
        caller, which skips the submission.
        """
        return {
            'id': submission.id,
            'title': submission.title,
            'author': str(submission.author) if submission.author else '[deleted]',
            'created_utc': datetime.fromtimestamp(submission.created_utc, tz=pytz.UTC),
            'score': submission.score,
            'num_comments': submission.num_comments,
            'upvote_ratio': submission.upvote_ratio,
            # Only the first 500 characters of the body are kept.
            'selftext': submission.selftext[:500] if submission.selftext else '',
            'url': submission.url,
            'subreddit': subreddit_name,
            'flair': submission.link_flair_text or 'No Flair',
            'is_video': submission.is_video,
            'is_self': submission.is_self,
            'permalink': f"https://reddit.com{submission.permalink}"
        }

    @staticmethod
    def _stream_post(post_data: Dict[str, Any], stream_container: Any) -> None:
        """Display a post as it's collected (no-op without a stream container)."""
        if not stream_container:
            return
        timestamp = datetime.now().strftime("%H:%M:%S")
        with stream_container.container():
            with st.expander(f"📝 {post_data['title'][:80]}...", expanded=False):
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    st.metric("Score", post_data['score'])
                with col2:
                    st.metric("Comments", post_data['num_comments'])
                with col3:
                    st.text(f"u/{post_data['author']}")
                with col4:
                    st.text(timestamp)

    @staticmethod
    def _update_stats(stats_container: Any, total: int, authors: int, comments: int) -> None:
        """Update collection statistics (no-op without a stats container)."""
        if not stats_container:
            return
        stats_container.empty()
        with stats_container:
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("📊 Posts", total)
            with col2:
                st.metric("👥 Authors", authors)
            with col3:
                st.metric("💬 Comments", f"{comments:,}")

    @staticmethod
    def _cache_lookup(cache_key: str, ttl_seconds: int) -> Optional[pd.DataFrame]:
        """Return the cached frame for ``cache_key`` if younger than ``ttl_seconds``.

        Must be called from the Streamlit script thread because it reads
        ``st.session_state``.
        """
        if 'data_cache' not in st.session_state:
            st.session_state.data_cache = {}
        entry = st.session_state.data_cache.get(cache_key)
        if entry is None:
            return None
        if (datetime.now() - entry['timestamp']).total_seconds() < ttl_seconds:
            return entry['data']
        return None

    @staticmethod
    def _cache_store(cache_key: str, df: pd.DataFrame) -> None:
        """Store ``df`` under ``cache_key`` with the current time (script thread only)."""
        if 'data_cache' not in st.session_state:
            st.session_state.data_cache = {}
        st.session_state.data_cache[cache_key] = {
            'data': df,
            'timestamp': datetime.now()
        }

    def _throttle(self) -> None:
        """Sleep as needed so consecutive calls are at least ``min_delay`` seconds apart."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)
        self.last_request_time = time.time()

    def _materialize_submissions(self, submissions: Any, limit: Optional[int],
                                 log_container: Any) -> List[Any]:
        """Drain the listing iterator into a list of at most ``limit`` submissions.

        Submissions whose ``id`` cannot be read are skipped. If the iterator
        itself fails, whatever was gathered so far is returned; the error is
        re-raised only when nothing was gathered at all.
        """
        submissions_list: List[Any] = []
        try:
            for submission in submissions:
                try:
                    # Touch an attribute so a broken submission fails here
                    # instead of later during conversion.
                    _ = submission.id
                except Exception:
                    continue
                submissions_list.append(submission)
                if limit is not None and len(submissions_list) >= limit:
                    break
        except Exception as fetch_error:
            error_msg = str(fetch_error)
            if not self._is_expected_praw_error(error_msg) and log_container:
                st.warning(f"⚠️ Stopped early: {error_msg}")
            if not submissions_list:
                if log_container:
                    st.error(f"No data could be fetched: {error_msg}")
                raise
        return submissions_list

    def _collect_posts(self, subreddit_name: str, sort_by: str, limit: int,
                       time_filter: str, log_container: Any = None,
                       progress_bar: Any = None) -> pd.DataFrame:
        """Fetch and convert posts; the shared core of all public fetch methods.

        With ``log_container=None`` and ``progress_bar=None`` this method makes
        no Streamlit calls, so it is safe to run on a worker thread.

        Args:
            subreddit_name: Name of subreddit to scrape
            sort_by: Sort method (hot, new, top, rising); anything else means hot
            limit: Maximum number of posts to fetch
            time_filter: Time filter, only used when ``sort_by == "top"``
            log_container: ``None``, a single container, or a
                ``(stats_container, stream_container)`` tuple
            progress_bar: Object with a ``progress(fraction)`` method, or ``None``

        Returns:
            DataFrame with one row per successfully converted post

        Raises:
            Exception: the original error, when it is not one of the expected
                PRAW "end of data" errors and no post was collected
        """
        stats_container = None
        stream_container = None
        if log_container:
            if isinstance(log_container, tuple):
                stats_container, stream_container = log_container
            else:
                stats_container = stream_container = log_container

        data: List[Dict[str, Any]] = []
        total_comments = 0
        try:
            subreddit = self.reddit.subreddit(subreddit_name)

            # Choose appropriate listing based on sort_by
            if sort_by == "top":
                submissions = subreddit.top(limit=limit, time_filter=time_filter)
            elif sort_by == "new":
                submissions = subreddit.new(limit=limit)
            elif sort_by == "rising":
                submissions = subreddit.rising(limit=limit)
            else:
                submissions = subreddit.hot(limit=limit)

            submissions_list = self._materialize_submissions(submissions, limit, log_container)

            processed = 0
            for start in range(0, len(submissions_list), self._BATCH_SIZE):
                for submission in submissions_list[start:start + self._BATCH_SIZE]:
                    self._throttle()
                    processed += 1
                    try:
                        post_data = self._submission_to_record(submission, subreddit_name)
                        data.append(post_data)
                        total_comments += post_data['num_comments']
                        self._stream_post(post_data, stream_container)
                    except Exception:
                        # Deliberate best-effort: one unreadable submission (or a
                        # failure while rendering it) must not abort the run.
                        continue

                # Refresh the UI once per batch, including the trailing partial
                # batch, so the progress bar reflects everything processed.
                try:
                    if log_container:
                        unique_authors = len({d['author'] for d in data})
                        self._update_stats(stats_container, len(data), unique_authors, total_comments)
                    if progress_bar and limit:
                        progress_bar.progress(min(processed / limit, 1.0))
                except Exception:
                    # A failed intermediate UI refresh must not lose collected
                    # data; the final stats update below is not suppressed.
                    pass

            # Final stats update
            if log_container:
                unique_authors = len({d['author'] for d in data})
                self._update_stats(stats_container, len(data), unique_authors, total_comments)

        except Exception as e:
            error_msg = str(e)
            # Don't show scary errors for common PRAW issues
            if self._is_expected_praw_error(error_msg):
                if log_container and not data:
                    st.warning("⚠️ No posts could be fetched. The subreddit may be empty or private.")
            else:
                if log_container:
                    st.error(f"❌ Error: {error_msg}")
                if not data:  # Only raise if we got no data at all
                    raise

        # Return whatever data we managed to collect
        if not data and log_container:
            st.info("ℹ️ No posts were collected. Try adjusting your filters or selecting a different subreddit.")

        return pd.DataFrame(data)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def fetch_subreddit_data_verbose(self, subreddit_name: str, sort_by: str = "hot", 
                           limit: int = 200, time_filter: str = "month", 
                           log_container=None) -> pd.DataFrame:
        """
        Fetch Reddit data with verbose logging

        If ``st.session_state`` holds a ``progress_bar`` entry, it is advanced
        after every batch of posts.

        Args:
            subreddit_name: Name of subreddit to scrape
            sort_by: Sort method (hot, new, top, rising)
            limit: Number of posts to fetch (optimized for 200+ items)
            time_filter: Time filter for top posts
            log_container: Streamlit container for logging output; either a
                single container or a ``(stats, stream)`` tuple

        Returns:
            DataFrame with Reddit posts data

        Raises:
            Exception: when fetching fails with an unexpected error and no
                post at all could be collected
        """
        try:
            progress_bar = st.session_state.get('progress_bar')
        except Exception:
            # Session state may be unavailable (e.g. no script run context);
            # progress reporting is optional, so carry on without it.
            progress_bar = None

        return self._collect_posts(subreddit_name, sort_by, limit, time_filter,
                                   log_container, progress_bar)

    def fetch_subreddit_data(self, subreddit_name: str, sort_by: str = "hot", 
                           limit: int = 200, time_filter: str = "month") -> pd.DataFrame:
        """
        Fetch data with manual session-based caching

        Results are kept in ``st.session_state.data_cache`` for one hour,
        keyed by subreddit, sort method, limit and time filter.
        """
        cache_key = f"{subreddit_name}_{sort_by}_{limit}_{time_filter}"

        cached = self._cache_lookup(cache_key, self._SINGLE_CACHE_TTL)
        if cached is not None:
            return cached

        df = self.fetch_subreddit_data_verbose(subreddit_name, sort_by, limit, time_filter, None)
        self._cache_store(cache_key, df)
        return df

    def fetch_multiple_subreddits(self, subreddits: List[str], limit_per: int = 100,
                                 sort_by: str = "hot") -> pd.DataFrame:
        """
        Fetch data from multiple subreddits with manual caching

        The combined result is cached for 30 minutes. Per-subreddit results
        are read from and written to the same one-hour cache used by
        ``fetch_subreddit_data``, but only from the calling thread: worker
        threads just fetch, because ``st.session_state`` is not usable from
        threads that lack a Streamlit script run context.

        Args:
            subreddits: List of subreddit names
            limit_per: Posts per subreddit
            sort_by: Sort method

        Returns:
            Combined DataFrame (empty when nothing could be fetched)
        """
        cache_key = f"multi_{'_'.join(sorted(subreddits))}_{sort_by}_{limit_per}"

        cached = self._cache_lookup(cache_key, self._MULTI_CACHE_TTL)
        if cached is not None:
            return cached

        # Same default time filter fetch_subreddit_data would apply.
        time_filter = "month"
        all_data = []
        pending = []

        # Serve what we can from the per-subreddit cache (script thread).
        for sub in subreddits:
            hit = self._cache_lookup(f"{sub}_{sort_by}_{limit_per}_{time_filter}",
                                     self._SINGLE_CACHE_TTL)
            if hit is not None:
                all_data.append(hit)
            else:
                pending.append(sub)

        if pending:
            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                future_to_sub = {
                    executor.submit(self._collect_posts, sub, sort_by, limit_per,
                                    time_filter, None, None): sub
                    for sub in pending
                }

                # as_completed is consumed on the calling thread, so st.error
                # and the cache writes below run with a script run context.
                for future in concurrent.futures.as_completed(future_to_sub):
                    sub = future_to_sub[future]
                    try:
                        data = future.result()
                    except Exception as e:
                        st.error(f"Error fetching r/{sub}: {e}")
                        continue
                    self._cache_store(f"{sub}_{sort_by}_{limit_per}_{time_filter}", data)
                    all_data.append(data)

        if all_data:
            df = pd.concat(all_data, ignore_index=True)
        else:
            df = pd.DataFrame()

        self._cache_store(cache_key, df)

        return df

def create_temporal_visualizations(df: pd.DataFrame) -> Dict[str, go.Figure]:
    """
    Create comprehensive temporal analytics visualizations

    The input frame is not modified; all derived columns are added to an
    internal, time-sorted copy.

    Args:
        df: DataFrame with Reddit data. Reads the columns ``created_utc``,
            ``id``, ``score``, ``num_comments`` and, when present, ``flair``.

    Returns:
        Dictionary of Plotly figures with the keys ``heatmap``, ``timeline``,
        ``monthly`` (monthly trend when the data spans at least 30 days,
        otherwise a weekly trend under the same key) and, when the frame has a
        ``flair`` column, ``flair``. An empty dictionary is returned when the
        frame is empty or has no ``created_utc`` column.
    """
    figures: Dict[str, go.Figure] = {}

    # Nothing temporal to plot: min()/max() would be NaT and the date-range
    # arithmetic below would fail.
    if df is None or df.empty or 'created_utc' not in df.columns:
        return figures

    # assign() + sort_values() both return new frames, so the caller's
    # (possibly session-cached) DataFrame is left untouched.
    df = df.assign(created_utc=pd.to_datetime(df['created_utc'])).sort_values('created_utc')

    # Actual date range of the collected data
    date_min = df['created_utc'].min()
    date_max = df['created_utc'].max()
    date_range = (date_max - date_min).days

    # 1. Hourly activity heatmap
    df['hour'] = df['created_utc'].dt.hour
    df['day_of_week'] = df['created_utc'].dt.day_name()

    heatmap_data = df.groupby(['day_of_week', 'hour']).size().reset_index(name='count')
    pivot_data = heatmap_data.pivot(index='day_of_week', columns='hour', values='count').fillna(0)

    # Reorder days. fill_value=0 matters: weekdays without any post would
    # otherwise come back as NaN rows and turn into garbage integers in the
    # astype(int) cell labels below.
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    pivot_data = pivot_data.reindex(days_order, fill_value=0)

    fig_heatmap = go.Figure(data=go.Heatmap(
        z=pivot_data.values,
        x=pivot_data.columns,
        y=pivot_data.index,
        colorscale='RdYlBu_r',
        text=pivot_data.values.astype(int),
        texttemplate='%{text}',
        textfont={"size": 8},
        hovertemplate='%{y}<br>%{x}:00<br>Posts: %{z}<extra></extra>'
    ))

    fig_heatmap.update_layout(
        title='Activity Heatmap by Hour and Day',
        xaxis_title='Hour of Day',
        yaxis_title='Day of Week',
        height=400
    )
    figures['heatmap'] = fig_heatmap

    # 2. Time series with rolling average - only include days with actual data
    daily_stats = df.set_index('created_utc').resample('D').agg({
        'id': 'count',
        'score': 'mean',
        'num_comments': 'mean'
    }).rename(columns={'id': 'post_count'})

    # Filter out days with no posts to prevent misleading gaps
    daily_stats = daily_stats[daily_stats['post_count'] > 0]

    # Rolling averages. The window counts rows, i.e. the last 7 days that
    # had posts, because empty days were dropped above.
    daily_stats['post_count_ma7'] = daily_stats['post_count'].rolling(window=7, min_periods=1).mean()
    daily_stats['score_ma7'] = daily_stats['score'].rolling(window=7, min_periods=1).mean()

    fig_timeline = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Daily Post Activity', 'Average Engagement Metrics'),
        vertical_spacing=0.1
    )

    # Post count
    fig_timeline.add_trace(
        go.Scatter(x=daily_stats.index, y=daily_stats['post_count'],
                  mode='markers', name='Daily Posts', opacity=0.5,
                  marker=dict(size=5, color='lightblue')),
        row=1, col=1
    )

    fig_timeline.add_trace(
        go.Scatter(x=daily_stats.index, y=daily_stats['post_count_ma7'],
                  mode='lines', name='7-Day Average',
                  line=dict(color='blue', width=2),
                  connectgaps=False),
        row=1, col=1
    )

    # Engagement metrics
    fig_timeline.add_trace(
        go.Scatter(x=daily_stats.index, y=daily_stats['score_ma7'],
                  mode='lines', name='Avg Score (7-day)',
                  line=dict(color='orange'),
                  connectgaps=False),
        row=2, col=1
    )

    fig_timeline.add_trace(
        go.Scatter(x=daily_stats.index, y=daily_stats['num_comments'].rolling(window=7, min_periods=1).mean(),
                  mode='lines', name='Avg Comments (7-day)',
                  line=dict(color='green'),
                  connectgaps=False),
        row=2, col=1
    )

    fig_timeline.update_layout(
        height=600,
        showlegend=True,
        title=f'Activity Timeline ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
        xaxis=dict(type='date', autorange=True),
        xaxis2=dict(type='date', autorange=True)
    )
    fig_timeline.update_xaxes(title_text="Date", row=2, col=1)
    fig_timeline.update_yaxes(title_text="Count", row=1, col=1)
    fig_timeline.update_yaxes(title_text="Value", row=2, col=1)

    figures['timeline'] = fig_timeline

    # 3. Monthly trend analysis (only show if data spans at least 30 days)
    if date_range >= 30:
        monthly_data = df.set_index('created_utc').resample('M').agg({
            'id': 'count',
            'score': ['mean', 'sum'],
            'num_comments': ['mean', 'sum']
        })

        # Filter out months with no posts
        monthly_data = monthly_data[monthly_data[('id', 'count')] > 0]

        fig_monthly = go.Figure()

        fig_monthly.add_trace(go.Bar(
            x=monthly_data.index,
            y=monthly_data[('id', 'count')],
            name='Monthly Posts',
            marker_color='lightblue'
        ))

        # Average score goes on a secondary y axis (y2) over the bars
        fig_monthly.add_trace(go.Scatter(
            x=monthly_data.index,
            y=monthly_data[('score', 'mean')],
            name='Avg Score',
            yaxis='y2',
            line=dict(color='red', width=2),
            connectgaps=False
        ))

        fig_monthly.update_layout(
            title=f'Monthly Posting Trends ({date_min.strftime("%Y-%m")} to {date_max.strftime("%Y-%m")})',
            xaxis_title='Month',
            xaxis=dict(type='date', autorange=True),
            yaxis=dict(title='Post Count', side='left'),
            yaxis2=dict(title='Average Score', side='right', overlaying='y'),
            height=400,
            hovermode='x unified'
        )

        figures['monthly'] = fig_monthly
    else:
        # For shorter periods, show weekly trends instead
        weekly_data = df.set_index('created_utc').resample('W').agg({
            'id': 'count',
            'score': ['mean', 'sum'],
            'num_comments': ['mean', 'sum']
        })

        # Filter out weeks with no posts
        weekly_data = weekly_data[weekly_data[('id', 'count')] > 0]

        fig_weekly = go.Figure()

        fig_weekly.add_trace(go.Bar(
            x=weekly_data.index,
            y=weekly_data[('id', 'count')],
            name='Weekly Posts',
            marker_color='lightblue'
        ))

        fig_weekly.add_trace(go.Scatter(
            x=weekly_data.index,
            y=weekly_data[('score', 'mean')],
            name='Avg Score',
            yaxis='y2',
            line=dict(color='red', width=2),
            connectgaps=False
        ))

        fig_weekly.update_layout(
            title=f'Weekly Posting Trends ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
            xaxis_title='Week',
            xaxis=dict(type='date', autorange=True),
            yaxis=dict(title='Post Count', side='left'),
            yaxis2=dict(title='Average Score', side='right', overlaying='y'),
            height=400,
            hovermode='x unified'
        )

        figures['monthly'] = fig_weekly  # Use same key for consistency

    # 4. Posting patterns by flair (weekly counts for the 10 most common flairs)
    if 'flair' in df.columns:
        flair_time = df.groupby([pd.Grouper(key='created_utc', freq='W'), 'flair']).size().reset_index(name='count')
        top_flairs = df['flair'].value_counts().head(10).index
        flair_time_filtered = flair_time[flair_time['flair'].isin(top_flairs)]

        fig_flair = px.line(flair_time_filtered, x='created_utc', y='count',
                           color='flair',
                           title=f'Weekly Posting Patterns by Flair ({date_min.strftime("%Y-%m-%d")} to {date_max.strftime("%Y-%m-%d")})',
                           labels={'count': 'Number of Posts', 'created_utc': 'Week'})
        fig_flair.update_layout(
            height=400,
            xaxis=dict(type='date', autorange=True)
        )
        figures['flair'] = fig_flair

    return figures

def create_engagement_analytics(df: pd.DataFrame) -> Dict[str, go.Figure]:
    """
    Create engagement and interaction analytics

    Args:
        df: DataFrame with Reddit data. Reads the columns ``score``,
            ``num_comments``, ``upvote_ratio``, ``is_self``, ``title`` and
            ``author``.

    Returns:
        Dictionary of engagement figures with the keys ``score_dist``,
        ``correlation`` and ``top_posts``. An empty dictionary is returned for
        an empty frame (for example the column-less frame produced when no
        posts were collected).
    """
    figures: Dict[str, go.Figure] = {}

    # A frame with no rows may also have no columns at all; bail out before
    # any column lookup.
    if df is None or df.empty:
        return figures

    # 1. Score distribution
    fig_score_dist = go.Figure()
    fig_score_dist.add_trace(go.Histogram(
        x=df['score'],
        nbinsx=50,
        name='Score Distribution',
        marker_color='orange'
    ))
    fig_score_dist.update_layout(
        title='Post Score Distribution',
        xaxis_title='Score',
        yaxis_title='Frequency',
        height=350
    )
    figures['score_dist'] = fig_score_dist

    # 2. Engagement correlation (marker size = upvote ratio, color = post type)
    fig_correlation = px.scatter(
        df, x='score', y='num_comments',
        size='upvote_ratio', color='is_self',
        title='Score vs Comments Correlation',
        labels={'score': 'Post Score', 'num_comments': 'Number of Comments',
                'is_self': 'Post Type', 'upvote_ratio': 'Upvote Ratio'},
        hover_data=['title']
    )
    fig_correlation.update_layout(height=400)
    figures['correlation'] = fig_correlation

    # 3. Top performing posts
    top_posts = df.nlargest(10, 'score')[['title', 'score', 'num_comments', 'author']]

    # Axis labels: titles cut to 30 characters, with an ellipsis only when
    # something was actually cut off. Computed once and shared by both bars.
    titles = top_posts['title'].astype(str)
    labels = titles.where(titles.str.len() <= 30, titles.str[:30] + '...')

    fig_top = go.Figure(data=[
        go.Bar(name='Score', x=labels, y=top_posts['score']),
        go.Bar(name='Comments', x=labels, y=top_posts['num_comments'])
    ])
    fig_top.update_layout(
        title='Top 10 Posts by Engagement',
        barmode='group',
        height=400,
        xaxis_tickangle=-45
    )
    figures['top_posts'] = fig_top

    return figures

def main():
    """
    Main application function
    
    Renders the whole dashboard on each Streamlit run:
      1. Seeds the session-state keys used to carry the scraper(s), the
         collected DataFrame, the last fetch time and comment hierarchies.
      2. Loads credentials from a .env file (current or parent directory) and
         draws the sidebar: credential inputs, collection mode and settings.
      3. When a scraper has been initialized, draws the "Live Collection" tab
         (start button, CSV download, last fetch time) and the
         "Analytics & Metrics" tab (summary metrics, charts, raw data,
         search, comment hierarchies).
      4. Otherwise shows a warning and instructions for obtaining Reddit API
         credentials.
    """
    
    # Initialize session state
    if 'scraper' not in st.session_state:
        st.session_state.scraper = None
    if 'advanced_scraper' not in st.session_state:
        st.session_state.advanced_scraper = None
    if 'data' not in st.session_state:
        st.session_state.data = pd.DataFrame()
    if 'last_fetch' not in st.session_state:
        st.session_state.last_fetch = None
    if 'comment_hierarchies' not in st.session_state:
        st.session_state.comment_hierarchies = {}
    
    # Header
    st.markdown('<h1 class="main-header">📊 Reddit Research Dashboard</h1>', unsafe_allow_html=True)
    st.markdown("Optimized for high-volume data collection and temporal analytics")
    
    # Load .env file if it exists
    env_vars = load_env_file(".env")
    if not env_vars:
        # Try parent directory
        env_vars = load_env_file("../.env")
    
    # Only the "Advanced with Hierarchy" branch below asks for these; give them
    # defaults so they are always bound regardless of the selected mode.
    use_checkpoint = False
    checkpoint_name = None
    
    # Sidebar configuration
    with st.sidebar:
        st.header("⚙️ Configuration")
        
        # Show .env detection status
        if env_vars:
            st.success("✅ .env file detected and loaded")
        
        # API Credentials - prioritize .env, then environment variables
        with st.expander("🔑 Reddit API Credentials", expanded=not bool(env_vars)):
            default_client_id = env_vars.get("REDDIT_CLIENT_ID", os.environ.get("REDDIT_CLIENT_ID", ""))
            default_client_secret = env_vars.get("REDDIT_CLIENT_SECRET", os.environ.get("REDDIT_CLIENT_SECRET", ""))
            default_user_agent = env_vars.get("REDDIT_USER_AGENT", os.environ.get("REDDIT_USER_AGENT", "RedditResearch/1.0"))
            
            client_id = st.text_input(
                "Client ID",
                value=default_client_id,
                type="password",
                help="Your Reddit API client ID (auto-populated from .env if available)"
            )
            client_secret = st.text_input(
                "Client Secret",
                value=default_client_secret,
                type="password",
                help="Your Reddit API client secret (auto-populated from .env if available)"
            )
            user_agent = st.text_input(
                "User Agent",
                value=default_user_agent,
                help="User agent string for API requests (auto-populated from .env if available)"
            )
            
            if st.button("Initialize Scraper", type="primary"):
                if client_id and client_secret:
                    try:
                        st.session_state.scraper = OptimizedRedditScraper(
                            client_id, client_secret, user_agent
                        )
                        if ADVANCED_FEATURES:
                            st.session_state.advanced_scraper = AdvancedRedditScraper(
                                client_id, client_secret, user_agent
                            )
                            st.success("✅ Scrapers initialized successfully (with advanced features)!")
                        else:
                            st.success("✅ Scraper initialized successfully!")
                    except Exception as e:
                        st.error(f"❌ Failed to initialize: {e}")
                else:
                    st.warning("⚠️ Please provide API credentials")
        
        # Data Collection Settings
        st.header("📥 Data Collection")
        
        # Show advanced mode only if features are available
        if ADVANCED_FEATURES:
            collection_mode = st.radio(
                "Collection Mode",
                ["Single Subreddit", "Multiple Subreddits", "Advanced with Hierarchy"]
            )
        else:
            collection_mode = st.radio(
                "Collection Mode",
                ["Single Subreddit", "Multiple Subreddits"]
            )

        if collection_mode == "Single Subreddit":
            subreddit_name = st.text_input("Subreddit Name", value="CUNY")
            subreddits = [subreddit_name]
        elif collection_mode == "Multiple Subreddits":
            subreddit_input = st.text_area(
                "Subreddits (one per line)",
                value="CUNY\nBaruch\nHunterCollege",
                height=100
            )
            subreddits = [s.strip() for s in subreddit_input.split('\n') if s.strip()]
        else:
            # Advanced with Hierarchy (only if ADVANCED_FEATURES is True)
            subreddit_name = st.text_input("Subreddit Name", value="CUNY")
            subreddits = [subreddit_name]
            use_checkpoint = st.checkbox("Enable checkpoint/resume", value=True)
            if use_checkpoint:
                checkpoint_name = st.text_input("Checkpoint name", value=f"{subreddit_name}_checkpoint")
        
        # Advanced settings
        with st.expander("⚙️ Advanced Settings"):
            sort_by = st.selectbox(
                "Sort By",
                ["hot", "new", "top", "rising"],
                help="How to sort posts"
            )
            
            limit = st.slider(
                "Posts per Subreddit",
                min_value=50,
                max_value=500,
                value=200,
                step=50,
                help="Number of posts to fetch (optimized for 200+)"
            )
            
            # The time filter is only offered for the "top" sort; other sorts
            # pass a fixed "month" value through to the scraper.
            if sort_by == "top":
                time_filter = st.selectbox(
                    "Time Filter",
                    ["hour", "day", "week", "month", "year", "all"],
                    index=3
                )
            else:
                time_filter = "month"
            
            # NOTE(review): batch_size and cache_ttl are collected here but are
            # not passed to anything within this function - confirm whether
            # they are meant to be wired into the scraper.
            batch_size = st.number_input(
                "Batch Size",
                min_value=10,
                max_value=50,
                value=25,
                help="Posts processed per batch"
            )
            
            cache_ttl = st.number_input(
                "Cache Duration (minutes)",
                min_value=5,
                max_value=120,
                value=60,
                help="How long to cache results"
            )
    
    # Main content area with tabs
    if st.session_state.scraper:
        
        # Create main tabs
        main_tab1, main_tab2 = st.tabs(["📥 Live Collection", "📊 Analytics & Metrics"])
        
        with main_tab1:
            st.header("Live Data Collection")
            
            col1, col2, col3 = st.columns(3)
            
            with col1:
                if st.button("🚀 Start Collection", type="primary", width="stretch"):
                    
                    # Initialize/clear stream posts
                    st.session_state.stream_posts = []
                    
                    # Create display containers
                    status_text = st.empty()
                    progress_bar = st.progress(0)
                    st.session_state.progress_bar = progress_bar
                    
                    # Fixed position containers for stats and stream
                    stats_placeholder = st.empty()
                    stream_placeholder = st.empty()
                    
                    status_text.info("🚀 Starting collection...")
                    
                    try:
                        if collection_mode == "Advanced with Hierarchy":
                            # Advanced scraping with comment hierarchies
                            status_text.info(f"Advanced scraping r/{subreddits[0]}...")
                            
                            checkpoint = checkpoint_name if use_checkpoint else None
                            results = st.session_state.advanced_scraper.scrape_with_hierarchy(
                                subreddits[0], limit=limit, checkpoint_name=checkpoint
                            )
                            
                            st.session_state.comment_hierarchies = results['hierarchies']
                            
                            # Convert to DataFrame
                            df = pd.DataFrame(results['submissions'])
                            if df.empty:
                                df = pd.DataFrame()
                            else:
                                df['created_utc'] = pd.to_datetime(df['created_utc'])
                            
                            st.session_state.data = df
                            status_text.success(f"✅ Scraped {len(results['submissions'])} posts with {len(results['comments'])} comments!")
                            
                        elif len(subreddits) == 1:
                            # Standard single subreddit with streaming
                            status_text.info(f"Collecting from r/{subreddits[0]}...")
                            
                            # Show header for stats
                            with stats_placeholder:
                                st.subheader("📊 Live Collection Progress")
                            
                            # Pass the placeholders to the scraper
                            df = st.session_state.scraper.fetch_subreddit_data_verbose(
                                subreddits[0], sort_by, limit, time_filter, 
                                (stats_placeholder, stream_placeholder)
                            )
                            st.session_state.data = df if not df.empty else pd.DataFrame()
                            
                            if len(df) > 0:
                                status_text.success(f"✅ Collected {len(df)} posts!")
                            else:
                                status_text.warning("⚠️ No posts collected")
                        else:
                            # Multiple subreddits with streaming
                            status_text.info(f"Collecting from {len(subreddits)} subreddits...")
                            
                            with stats_placeholder:
                                st.subheader("📊 Live Collection Progress")
                            
                            all_data = []
                            for idx, sub in enumerate(subreddits):
                                status_text.info(f"Collecting {idx+1}/{len(subreddits)}: r/{sub}...")
                                df = st.session_state.scraper.fetch_subreddit_data_verbose(
                                    sub, sort_by, limit, time_filter, 
                                    (stats_placeholder, stream_placeholder)
                                )
                                all_data.append(df)
                            
                            if all_data:
                                df = pd.concat(all_data, ignore_index=True)
                            else:
                                df = pd.DataFrame()
                            
                            st.session_state.data = df
                            status_text.success(f"✅ Collected {len(df)} total posts!")
                        
                        st.session_state.last_fetch = datetime.now()
                        
                    except Exception as e:
                        error_msg = str(e)
                        # Don't show PRAW iterator exhaustion errors
                        if "Ran out of input" not in error_msg and "prawcore" not in error_msg.lower():
                            status_text.error(f"❌ Error: {error_msg}")
                        elif st.session_state.data.empty:
                            status_text.warning("⚠️ No posts could be fetched. Try adjusting your filters.")
            
            with col2:
                if not st.session_state.data.empty:
                    st.download_button(
                        "📥 Download CSV",
                        st.session_state.data.to_csv(index=False),
                        file_name=f"reddit_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                        mime="text/csv",
                        width="stretch"
                    )
            
            with col3:
                if st.session_state.last_fetch:
                    st.info(f"Last: {st.session_state.last_fetch.strftime('%H:%M:%S')}")
        
        # Analytics & Metrics Tab
        with main_tab2:
            if not st.session_state.data.empty:
                df = st.session_state.data
                
                # Summary metrics at top
                st.header("📈 Summary Metrics")
                
                col1, col2, col3, col4, col5 = st.columns(5)
                
                with col1:
                    st.metric("Total Posts", f"{len(df):,}")
                with col2:
                    st.metric("Unique Authors", f"{df['author'].nunique():,}")
                with col3:
                    st.metric("Avg Score", f"{df['score'].mean():.1f}")
                with col4:
                    st.metric("Avg Comments", f"{df['num_comments'].mean():.1f}")
                with col5:
                    st.metric("Subreddits", len(df['subreddit'].unique()))
                
                # Tabbed interface for different analyses
                tab1, tab2, tab3, tab4, tab5 = st.tabs([
                    "📊 Temporal Analytics",
                    "💬 Engagement Analysis", 
                    "📋 Raw Data",
                    "🔍 Search & Filter",
                    "🌳 Comment Hierarchies"
                ])
                
                with tab1:
                    st.header("Temporal Analytics")
                
                    # Generate temporal visualizations
                    temporal_figs = create_temporal_visualizations(df)
                
                    # Activity heatmap
                    st.plotly_chart(temporal_figs.get('heatmap'), use_container_width=True)

                    # Time series
                    st.plotly_chart(temporal_figs.get('timeline'), use_container_width=True)

                    # Monthly trends
                    col1, col2 = st.columns(2)
                    with col1:
                        st.plotly_chart(temporal_figs.get('monthly'), use_container_width=True)
                    with col2:
                        if 'flair' in temporal_figs:
                            st.plotly_chart(temporal_figs.get('flair'), use_container_width=True)
                
                with tab2:
                    st.header("Engagement Analysis")
                
                    engagement_figs = create_engagement_analytics(df)

                    col1, col2 = st.columns(2)
                    with col1:
                        st.plotly_chart(engagement_figs['score_dist'], use_container_width=True)
                    with col2:
                        st.plotly_chart(engagement_figs['correlation'], use_container_width=True)

                    st.plotly_chart(engagement_figs['top_posts'], use_container_width=True)
            
                with tab3:
                    st.header("Raw Data View")
                
                    # Data filtering options
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        min_score = st.number_input("Min Score", value=0)
                    with col2:
                        min_comments = st.number_input("Min Comments", value=0)
                    with col3:
                        author_filter = st.text_input("Author Filter")
                
                    # Apply filters
                    filtered_df = df[
                        (df['score'] >= min_score) &
                        (df['num_comments'] >= min_comments)
                    ]
                
                    if author_filter:
                        # regex=False: the filter is free-form user text, so match
                        # it literally instead of compiling it as a pattern.
                        filtered_df = filtered_df[
                            filtered_df['author'].str.contains(
                                author_filter, case=False, na=False, regex=False
                            )
                        ]
                
                    st.dataframe(
                        filtered_df[['title', 'author', 'score', 'num_comments', 
                                    'created_utc', 'subreddit', 'flair']],
                        width="stretch",
                        height=500
                    )
                
                    st.info(f"Showing {len(filtered_df)} of {len(df)} posts")
            
                with tab4:
                    st.header("Search & Filter")
                
                    search_query = st.text_input("Search in titles and text", placeholder="Enter keywords...")
                
                    if search_query:
                        # regex=False: keywords such as "c++" or "(" are not valid
                        # regular expressions and would otherwise raise.
                        mask = (
                            df['title'].str.contains(search_query, case=False, na=False, regex=False) |
                            df['selftext'].str.contains(search_query, case=False, na=False, regex=False)
                        )
                        search_results = df[mask]
                        
                        st.info(f"Found {len(search_results)} posts matching '{search_query}'")
                        
                        if not search_results.empty:
                            # Only the first ten matches are rendered
                            for idx, row in search_results.head(10).iterrows():
                                with st.expander(f"📝 {row['title'][:100]}..."):
                                    col1, col2, col3 = st.columns(3)
                                    with col1:
                                        st.metric("Score", row['score'])
                                    with col2:
                                        st.metric("Comments", row['num_comments'])
                                    with col3:
                                        st.metric("Date", row['created_utc'].strftime('%Y-%m-%d'))
                                    
                                    st.write(f"**Author:** u/{row['author']}")
                                    st.write(f"**Subreddit:** r/{row['subreddit']}")
                                    # A missing selftext (NaN) is truthy but cannot
                                    # be sliced, so require an actual string.
                                    selftext = row['selftext']
                                    if isinstance(selftext, str) and selftext:
                                        st.write(f"**Text:** {selftext[:500]}...")
                                    st.write(f"[View on Reddit]({row['permalink']})")
            
                with tab5:
                    st.header("Comment Hierarchies")

                    if not ADVANCED_FEATURES:
                        st.info("⚠️ Comment hierarchy analysis requires additional dependencies. This feature is optional and not needed for basic data collection.")
                    elif st.session_state.comment_hierarchies:
                        # Select submission to view
                        submission_ids = list(st.session_state.comment_hierarchies.keys())
                        selected_sub = st.selectbox("Select Submission", submission_ids)

                        # Everything below reads `hierarchy`, so it all lives
                        # under this guard; nothing is rendered without a selection.
                        if selected_sub:
                            hierarchy = st.session_state.comment_hierarchies[selected_sub]

                            # Display submission info
                            submission_info = hierarchy.get('submission')
                            if submission_info:
                                st.subheader(f"📝 {submission_info.get('title', 'No Title')}")
                                col1, col2, col3 = st.columns(3)
                                with col1:
                                    st.metric("Score", submission_info.get('score', 0))
                                with col2:
                                    st.metric("Comments", len(hierarchy.get('comments', {})))
                                with col3:
                                    st.metric("Author", submission_info.get('author', '[deleted]'))

                            # Visualize comment tree
                            # NOTE(review): replies are rendered as expanders inside
                            # their parent's expander. Some Streamlit releases reject
                            # nested expanders - confirm against the deployed version.
                            def display_comment_tree(comments, level=0):
                                for comment_id, comment in comments.items():
                                    indent = "  " * level
                                    with st.expander(f"{indent}💬 {comment.get('author', '[deleted]')} - Score: {comment.get('score', 0)}"):
                                        st.write(comment.get('body', '')[:500])
                                        if 'replies' in comment and comment['replies']:
                                            st.write("**Replies:**")
                                            display_comment_tree(comment['replies'], level + 1)

                            st.subheader("Comment Thread Structure")
                            if hierarchy.get('hierarchy'):
                                display_comment_tree(hierarchy['hierarchy'])
                            else:
                                st.info("No comments found for this submission")

                            # Orphan statistics - shown whether or not the selected
                            # submission has comments
                            advanced_scraper = st.session_state.get('advanced_scraper')
                            if advanced_scraper:
                                orphan_stats = advanced_scraper.hierarchy_tracker.get_orphan_statistics()
                                if orphan_stats['orphaned_count'] > 0:
                                    st.warning(f"⚠️ {orphan_stats['orphaned_count']} orphaned comments detected ({orphan_stats['orphan_rate']:.1%} orphan rate)")
                    else:
                        st.info("Use 'Advanced with Hierarchy' collection mode to analyze comment structures")
            else:
                # Empty state - no data collected yet
                st.info("👆 Configure your settings in the sidebar and click 'Start Collection' to begin")
            
            # Quick start guide
            with st.expander("🚀 Quick Start Guide"):
                st.markdown("""
                ### Getting Started
                
                1. **Set up API Credentials**
                   - Get your Reddit API credentials from [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps)
                   - Enter them in the sidebar
                   - Click "Initialize Scraper"
                
                2. **Choose Collection Mode**
                   - **Single Subreddit**: Analyze one community in depth
                   - **Multiple Subreddits**: Collect from multiple communities
                
                3. **Configure Settings**
                   - Adjust the number of posts (200+ recommended)
                   - Choose sort method (hot, new, top, rising)
                   - Set time filter for top posts
                
                4. **Fetch & Analyze**
                   - Click "Start Collection" to start collection
                   - Explore temporal patterns, engagement metrics
                   - Export results as CSV for further analysis
                
                ### Features
                
                - **Batch Processing**: Efficiently handles 200+ posts
                - **Caching**: Reduces API calls with smart caching
                - **Temporal Analytics**: Hour/day/month patterns
                - **Engagement Metrics**: Score, comments, correlations
                """)
    
    else:
        st.warning("⚠️ Please initialize the scraper with your Reddit API credentials in the sidebar")
        
        # API setup instructions
        with st.expander("📖 How to get Reddit API credentials"):
            st.markdown("""
            ### Setting up Reddit API Access
            
            1. **Create a Reddit Account** (if you don't have one)
               - Go to [reddit.com](https://www.reddit.com) and sign up
            
            2. **Create an App**
               - Visit [reddit.com/prefs/apps](https://www.reddit.com/prefs/apps)
               - Click "Create App" or "Create Another App"
               - Fill in the form:
                 - **Name**: Your app name (e.g., "Research Dashboard")
                 - **App Type**: Select "script"
                 - **Description**: Optional
                 - **About URL**: Leave blank
                 - **Redirect URI**: http://localhost:8000
               - Click "Create app"
            
            3. **Get Your Credentials**
               - **Client ID**: The string under "personal use script"
               - **Client Secret**: The secret key shown
               - **User Agent**: Format: "Platform:AppName:Version (by /u/YourUsername)"
            
            4. **Enter in Sidebar**
               - Copy your credentials to the sidebar fields
               - Click "Initialize Scraper"
            """)

# Entry point: build the dashboard only when this file is run as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()