# app/model.py
"""
CPU-optimized model loading with automatic GGUF download.
Uses llama.cpp for 2-4x faster inference on CPU.
"""

import gc
import os
from typing import Generator, Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Global singleton
_llama_model: Optional[Llama] = None

# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
CACHE_DIR = "/tmp/models"


def download_gguf_model() -> str:
    """
    Download the GGUF model from Hugging Face if it is not already present locally.
    Returns the local path to the model file.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, MODEL_FILE)

    # Skip the download if the file is already present
    if os.path.exists(local_path):
        print(f"GGUF model already exists: {local_path}")
        print(f"Size: {os.path.getsize(local_path) / (1024 * 1024):.1f} MB")
        return local_path

    print(f"Downloading GGUF model: {MODEL_FILE}")
    print(f"From: {MODEL_REPO}")
    print("This may take a few minutes...")

    try:
        # Download from Hugging Face
        downloaded_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            cache_dir=CACHE_DIR,
            local_dir=CACHE_DIR,
            local_dir_use_symlinks=False,
        )
        print(f"Model downloaded to: {downloaded_path}")
        print(f"Size: {os.path.getsize(downloaded_path) / (1024 * 1024):.1f} MB")
        return downloaded_path
    except Exception as e:
        print(f"Error downloading GGUF model: {e}")
        raise


def load_model() -> Llama:
    """
    Load the GGUF model with llama.cpp (optimized for CPU).
    Downloads the model automatically if it is not present.
    """
    global _llama_model

    if _llama_model is not None:
        return _llama_model

    # Download if needed
    model_path = download_gguf_model()

    print("Loading GGUF model with llama.cpp (CPU optimized)...")
    print("Using iq2_m quantization (2-bit, very fast)")

    # CPU optimizations for HF Spaces (2 vCPUs, limited RAM)
    _llama_model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window
        n_threads=2,      # HF Spaces free tier has 2 vCPUs
        n_batch=256,      # Smaller batch for memory efficiency
        verbose=False,    # Quiet mode
        use_mmap=True,    # Memory-map the file for faster loading
        use_mlock=False,  # Don't lock memory (HF Spaces constraint)
    )

    print("Model loaded successfully!")
    print("Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")

    gc.collect()
    return _llama_model


def generate_stream(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200,
) -> Generator[str, None, None]:
    """
    Streaming generation with llama.cpp (fast on CPU).
    """
    model = load_model()

    # llama.cpp native streaming - very fast on CPU
    stream = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stream=True,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )

    for output in stream:
        text = output["choices"][0]["text"]
        if text:
            yield text

    gc.collect()


def generate(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200,
) -> str:
    """
    Non-streaming generation with llama.cpp.
    """
    model = load_model()

    output = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )

    gc.collect()
    return output["choices"][0]["text"]
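

# --- Usage sketch (illustrative only) ---
# A minimal, hedged example of how the helpers above could be exercised locally.
# The ChatML-style prompt layout below is an assumption inferred from the
# "<|im_end|>" stop token used above; if the model ships a different chat
# template, build the prompt from that template instead.
if __name__ == "__main__":
    demo_prompt = (
        "<|im_start|>user\n"
        "Explain in one sentence why GGUF quantization speeds up CPU inference.<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # Non-streaming call: returns the full completion as a single string.
    print(generate(demo_prompt, temperature=0.7, max_tokens=64))

    # Streaming call: chunks are yielded as llama.cpp produces them.
    for chunk in generate_stream(demo_prompt, temperature=0.7, max_tokens=64):
        print(chunk, end="", flush=True)
    print()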