# main.py

import os
import io                     # For handling bytes data in memory
import yt_dlp                 # YouTube audio downloader
import requests               # For making HTTP requests (to audio URLs)
from fastapi import FastAPI, HTTPException, Request # The web framework
from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
from pydantic import BaseModel # For data validation
from huggingface_hub import InferenceClient # HF API client
from dotenv import load_dotenv # To load .env file locally

# --- Initial Setup ---

# Load environment variables from a local .env file (intended for local development).
# On Hugging Face Spaces, secrets are configured in the Space settings rather
# than through a .env file.
load_dotenv()

# Hugging Face API token, read from the process environment (None when unset).
HF_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")

# A missing or empty key is only reported here: startup deliberately continues,
# and API calls made with the missing key are expected to fail later.
# A real deployment may prefer to exit or raise at this point instead.
if HF_API_KEY is None or HF_API_KEY == "":
    print("ERROR: HUGGINGFACE_API_KEY environment variable not found.")

# Hugging Face model identifiers used by this service.
# Both can be swapped: per the author's note, smaller Whisper variants
# (base, small, medium) are faster, and different LLMs have different strengths.
ASR_MODEL = "openai/whisper-large-v3"
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"

# Initialize the Hugging Face Inference Client, passing the API key as its token.
# Any exception raised during construction is reported and hf_inference is set
# to None instead, so code that uses the client should check for None first.
try:
    hf_inference = InferenceClient(token=HF_API_KEY)
    print("Hugging Face Inference Client initialized.")
except Exception as e:
    print(f"ERROR: Failed to initialize Hugging Face Inference Client: {e}")
    hf_inference = None # Ensure the name is always defined, even on failure

# Create the FastAPI application object.
# title, description and version are the API metadata passed to FastAPI.
app = FastAPI(
    title="Video Note Taker API",
    description="Transcribes videos and generates notes using Hugging Face models.",
    version="0.1.0",
)

# --- CORS Configuration ---
# Configure Cross-Origin Resource Sharing (CORS).
# This is required so that the frontend, which runs on a different origin
# than this backend API, is allowed to make requests to it.
origins = [
    "http://localhost:3000", # Local frontend dev server
    # TODO: add the deployed Vercel frontend URL here once it exists.
    # Example: "https://videos-notes-app.vercel.app",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins, # Only the origins listed above are allowed
    allow_credentials=True, # Permit credentialed requests (e.g. cookies); not strictly needed yet
    allow_methods=["*"], # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"], # Allow all request headers
)

# --- Data Models (Request Validation) ---

# Pydantic model describing the expected JSON request body.
# Validation is handled by pydantic: the body must contain a string field
# named "youtubeUrl" (camelCase, matching the key the client sends).
class ProcessRequest(BaseModel):
    youtubeUrl: str # Required string field holding the YouTube video URL

# File extensions accepted for audio-only streams on the first selection pass.
_PREFERRED_AUDIO_EXTS = ("mp3", "opus", "m4a", "webm")
# (connect, read) timeouts in seconds for the audio stream request, so a
# stalled connection cannot block the request handler indefinitely.
_AUDIO_HTTP_TIMEOUT = (10, 60)
# Number of bytes requested per chunk while streaming the audio response.
_AUDIO_CHUNK_SIZE = 8192


def _select_audio_format(formats):
    """Return the first usable audio format dict from *formats*, or None.

    Only entries that carry audio (``acodec`` is not ``'none'``) and expose a
    direct ``url`` are considered. The first audio-only entry whose extension
    is in ``_PREFERRED_AUDIO_EXTS`` wins; otherwise the first entry with any
    audio track is returned.
    """
    # NOTE(review): yt-dlp is believed to order `formats` from worst to best
    # quality, so "first match" likely means the smallest stream - confirm
    # this is the intended trade-off (smaller payloads transcribe faster).
    with_audio = [
        fmt for fmt in formats
        if fmt.get('acodec') != 'none' and fmt.get('url')
    ]
    for fmt in with_audio:
        if fmt.get('vcodec') == 'none' and fmt.get('ext') in _PREFERRED_AUDIO_EXTS:
            return fmt
    return with_audio[0] if with_audio else None


def download_audio_bytes(youtube_url: str) -> bytes:
    """Fetch the audio track of a YouTube video and return it as raw bytes.

    yt-dlp is used only to extract the video's metadata (nothing is
    downloaded through yt-dlp itself). A direct audio stream URL is picked
    from the reported formats and then downloaded with ``requests``.

    Args:
        youtube_url: URL of the video whose audio should be fetched.

    Returns:
        The downloaded audio data, in whatever container the selected
        stream uses (no transcoding is performed).

    Raises:
        HTTPException: Always with status 500 - when yt-dlp cannot extract
            the video or no direct audio URL is available, when the HTTP
            download fails or times out, or on any other unexpected error
            (including an empty download).
    """
    print(f"Attempting to download audio for: {youtube_url}")
    ydl_opts = {
        'format': 'bestaudio/best',  # Prefer audio-only, fall back to best overall
        'noplaylist': True,          # Don't expand a playlist if the URL is part of one
        'quiet': True,               # Suppress yt-dlp console output
        'no_warnings': True,
    }

    try:
        # Metadata only: download=False means yt-dlp never writes any media.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        audio_format = _select_audio_format((info or {}).get('formats') or [])
        if audio_format is None:
            print("Could not find suitable audio stream URL via yt-dlp info.")
            raise yt_dlp.utils.DownloadError("Could not extract a direct audio URL and ffmpeg may not be available.")

        audio_url = audio_format['url']
        format_note = audio_format.get('format_note', audio_format.get('ext', 'N/A'))
        print(f"Found audio format: {format_note}. Downloading directly from URL...")

        # Stream the response so the body is read in chunks; the context
        # manager releases the connection even if reading fails midway.
        with requests.get(audio_url, stream=True, timeout=_AUDIO_HTTP_TIMEOUT) as response:
            response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
            audio_bytes = b"".join(response.iter_content(chunk_size=_AUDIO_CHUNK_SIZE))

        print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
        if not audio_bytes:
            raise ValueError("Downloaded audio data is empty.")
        return audio_bytes

    except yt_dlp.utils.DownloadError as e:
        print(f"ERROR: yt-dlp failed to download or process audio: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to download audio from YouTube: {e}") from e
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Failed to download audio stream from URL: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to fetch audio stream: {e}") from e
    except Exception as e:
        # Top-level boundary: convert anything unexpected into an HTTP 500.
        print(f"ERROR: Unexpected error during audio download: {e}")
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}") from e