import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
from requests.sessions import Session
from langdetect import detect, LangDetectException
from deep_translator import GoogleTranslator
from requests.exceptions import RequestException
import time

# --- CONSTANTS ---
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5"
}

# --- HELPER FUNCTIONS ---

def get_session():
    """Creates a session with standard headers."""
    session = Session()
    session.headers.update(HEADERS)
    return session

def perform_login(session, login_url, email, password):
    """
    Attempts a login. 
    ENHANCEMENT: Fetches the page first to find hidden CSRF tokens.
    """
    try:
        # 1. Get the login page first to set cookies/CSRF tokens
        page = session.get(login_url, timeout=10)
        page.raise_for_status()
        
        soup = BeautifulSoup(page.content, 'html.parser')
        payload = {
            'email': email,       # Note: Some sites use 'username', this is specific to target
            'password': password
        }

        # 2. Find hidden inputs (often required for CSRF protection)
        form = soup.find('form')
        if form:
            for input_tag in form.find_all('input', type='hidden'):
                name = input_tag.get('name')
                value = input_tag.get('value')
                if name and value:
                    payload[name] = value
        
        # 3. Post credentials
        response = session.post(login_url, data=payload, timeout=10)
        response.raise_for_status()
        return True, "Login request sent."
    except Exception as e:
        return False, str(e)

def chunk_text(text, max_chars=4500):
    """Splits text into chunks to respect translation API limits."""
    return [text[i:i+max_chars] for i in range(0, len(text), max_chars)]

def translate_content(text, target_lang='en'):
    """
    Translates text using Deep Translator (more stable).
    Optimized to check language first, then translate in chunks.
    """
    if not text or len(text) < 5:
        return text

    try:
        # Check language first to avoid unnecessary API calls
        detected_lang = detect(text[:1000]) # Detect based on first 1000 chars
        if detected_lang == target_lang:
            return text
    except LangDetectException:
        pass # If detection fails, attempt translation anyway

    translator = GoogleTranslator(source='auto', target=target_lang)
    chunks = chunk_text(text)
    translated_chunks = []

    progress_bar = st.progress(0)
    
    for i, chunk in enumerate(chunks):
        try:
            translated = translator.translate(chunk)
            translated_chunks.append(translated)
        except Exception:
            # Fallback: append original if translation fails
            translated_chunks.append(chunk)
        
        # Update progress
        progress_bar.progress((i + 1) / len(chunks))

    progress_bar.empty() # Remove bar when done
    return " ".join(translated_chunks)

def scrape_url(url, query_selector=None, auth_details=None):
    session = get_session()

    # --- Authentication Phase ---
    if auth_details and auth_details.get('login_url'):
        st.info("🔄 Attempting authentication...")
        success, msg = perform_login(
            session, 
            auth_details['login_url'], 
            auth_details['email'], 
            auth_details['password']
        )
        if not success:
            st.error(f"Authentication failed (proceeding as guest): {msg}")
        else:
            st.success("Authentication request sent (Session updated).")

    # --- Fetching Phase ---
    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()
    except RequestException as e:
        return None, f"Network Error: {e}"

    # --- Parsing Phase ---
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove clutter
    for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "nav", "aside", "svg", "iframe", "ad"]):
        tag.decompose() # decompose is faster than extract

    # Extraction
    if query_selector:
        elements = soup.select(query_selector)
        if not elements:
            return None, "Query selector found no elements."
        # Get text with separator to prevent words merging
        text_content = "\n".join([el.get_text(separator=' ', strip=True) for el in elements])
    else:
        # Get all visible text from body, using separator
        if soup.body:
            text_content = soup.body.get_text(separator=' ', strip=True)
        else:
            text_content = soup.get_text(separator=' ', strip=True)

    # Clean up whitespace
    clean_text = re.sub(r'\s+', ' ', text_content).strip()
    
    return clean_text, None

# --- MAIN APP ---

def main():
    st.set_page_config(page_title="Universal Web Scraper", page_icon="🕷️", layout="wide")
    
    st.title("🕷️ Universal Web Scraper & Translator")
    st.markdown("---")

    # --- Sidebar Inputs ---
    with st.sidebar:
        st.header("⚙️ Configuration")
        url_input = st.text_input("Target URL", placeholder="https://example.com")
        query_selector = st.text_input("CSS Selector (Optional)", placeholder="e.g., div.article-content")
        enable_translation = st.checkbox("Enable Auto-Translation", value=True)
        
        with st.expander("🔐 Authentication (Advanced)"):
            st.caption("Only use if the content is behind a login.")
            login_url = st.text_input("Login Page URL")
            email = st.text_input("Email/Username")
            password = st.text_input("Password", type="password")

    # --- Main Logic ---
    if st.button("🚀 Start Scraping", type="primary"):
        if not url_input:
            st.warning("Please enter a URL to proceed.")
            return

        auth_details = None
        if login_url and email and password:
            auth_details = {'login_url': login_url, 'email': email, 'password': password}

        with st.spinner("Fetching and processing data..."):
            # 1. Scrape
            scraped_text, error = scrape_url(url_input, query_selector, auth_details)

            if error:
                st.error(error)
            elif not scraped_text:
                st.warning("No text content found at this URL. The site might be JavaScript-rendered (SPA).")
            else:
                # 2. Translate (if enabled)
                final_text = scraped_text
                if enable_translation:
                    with st.status("Detecting language and translating...", expanded=True) as status:
                        final_text = translate_content(scraped_text)
                        status.update(label="Processing Complete!", state="complete", expanded=False)

                # 3. Display Results
                st.success(f"Successfully extracted {len(final_text)} characters.")
                
                tab1, tab2 = st.tabs(["📄 Cleaned Text", "🔍 Raw Preview"])
                
                with tab1:
                    st.text_area("Content", final_text, height=400)
                    
                    # Download Button
                    st.download_button(
                        label="📥 Download Text",
                        data=final_text,
                        file_name="scraped_data.txt",
                        mime="text/plain"
                    )
                
                with tab2:
                    st.json({
                        "source": url_input,
                        "length": len(final_text),
                        "selector_used": query_selector if query_selector else "Body (Default)",
                        "snippet": final_text[:500] + "..."
                    })

if __name__ == "__main__":
    main()