# app/model.py
"""
CPU-optimized model loading with automatic GGUF download.
Uses llama.cpp for 2-4x faster inference on CPU.
"""

import gc
import os
from typing import Generator, Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Global singleton
_llama_model: Optional[Llama] = None

# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
CACHE_DIR = "/tmp/models"


def download_gguf_model() -> str:
    """
    Download the GGUF model from Hugging Face if it is not already present locally.
    Returns the local path to the model file.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, MODEL_FILE)

    # Skip the download if the file is already present
    if os.path.exists(local_path):
        print(f"GGUF model already exists: {local_path}")
        print(f"Size: {os.path.getsize(local_path) / (1024 * 1024):.1f} MB")
        return local_path

    print(f"Downloading GGUF model: {MODEL_FILE}")
    print(f"From: {MODEL_REPO}")
    print("This may take a few minutes...")

    try:
        # Download from Hugging Face
        downloaded_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            cache_dir=CACHE_DIR,
            local_dir=CACHE_DIR,
            local_dir_use_symlinks=False,
        )
        print(f"Model downloaded to: {downloaded_path}")
        print(f"Size: {os.path.getsize(downloaded_path) / (1024 * 1024):.1f} MB")
        return downloaded_path
    except Exception as e:
        print(f"Error downloading GGUF model: {e}")
        raise


def load_model() -> Llama:
    """
    Load the GGUF model with llama.cpp (optimized for CPU).
    Downloads the model automatically if it is not present.
    """
    global _llama_model

    if _llama_model is not None:
        return _llama_model

    # Download if needed
    model_path = download_gguf_model()

    print("Loading GGUF model with llama.cpp (CPU optimized)...")
    print("Using iq2_m quantization (2-bit, very fast)")

    # CPU optimizations for HF Spaces (2 vCPUs, limited RAM)
    _llama_model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window
        n_threads=2,      # HF Spaces free tier has 2 vCPUs
        n_batch=256,      # Smaller batch for memory efficiency
        verbose=False,    # Quiet mode
        use_mmap=True,    # Memory-map the file for faster loading
        use_mlock=False,  # Don't lock memory (HF Spaces constraint)
    )

    print("Model loaded successfully!")
    print("Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")

    gc.collect()
    return _llama_model


def generate_stream(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200,
) -> Generator[str, None, None]:
    """
    Streaming generation with llama.cpp (fast on CPU).
    """
    model = load_model()

    # llama.cpp native streaming - very fast on CPU
    stream = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stream=True,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )

    for output in stream:
        text = output["choices"][0]["text"]
        if text:
            yield text

    gc.collect()


def generate(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200,
) -> str:
    """
    Non-streaming generation with llama.cpp.
    """
    model = load_model()

    output = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )

    gc.collect()
    return output["choices"][0]["text"]
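

# --- Usage sketch (illustrative only) ---
# A minimal, hedged example of how the helpers above could be exercised locally.
# The ChatML-style prompt layout below is an assumption inferred from the
# "<|im_end|>" stop token used above; if the model ships a different chat
# template, build the prompt from that template instead.
if __name__ == "__main__":
    demo_prompt = (
        "<|im_start|>user\n"
        "Explain in one sentence why GGUF quantization speeds up CPU inference.<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # Non-streaming call: returns the full completion as a single string.
    print(generate(demo_prompt, temperature=0.7, max_tokens=64))

    # Streaming call: chunks are yielded as llama.cpp produces them.
    for chunk in generate_stream(demo_prompt, temperature=0.7, max_tokens=64):
        print(chunk, end="", flush=True)
    print()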