# app/model.py
"""
CPU-optimized model loading with automatic GGUF download.
Uses llama.cpp for 2-4x faster inference on CPU.
"""
import gc
import os
from typing import Generator, Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Global singleton
_llama_model: Optional[Llama] = None

# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
CACHE_DIR = "/tmp/models"

def download_gguf_model() -> str:
    """
    Download the GGUF model from Hugging Face if it does not exist locally.
    Returns the local path to the model file.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, MODEL_FILE)

    # Skip the download if the file is already present
    if os.path.exists(local_path):
        print(f"GGUF model already exists: {local_path}")
        print(f"Size: {os.path.getsize(local_path) / (1024*1024):.1f} MB")
        return local_path

    print(f"Downloading GGUF model: {MODEL_FILE}")
    print(f"From: {MODEL_REPO}")
    print("This may take a few minutes...")

    try:
        # Download from Hugging Face
        downloaded_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            cache_dir=CACHE_DIR,
            local_dir=CACHE_DIR,
            local_dir_use_symlinks=False,  # deprecated (and ignored) on recent huggingface_hub versions
        )
        print(f"Model downloaded to: {downloaded_path}")
        print(f"Size: {os.path.getsize(downloaded_path) / (1024*1024):.1f} MB")
        return downloaded_path
    except Exception as e:
        print(f"Error downloading GGUF model: {e}")
        raise

def load_model() -> Llama:
    """
    Load the GGUF model with llama.cpp (optimized for CPU).
    Downloads it automatically if not present.
    """
    global _llama_model
    if _llama_model is not None:
        return _llama_model

    # Download if needed
    model_path = download_gguf_model()

    print("Loading GGUF model with llama.cpp (CPU optimized)...")
    print("Using iq2_m quantization (2-bit, very fast)")

    # CPU optimizations for HF Spaces (2 vCPU, limited RAM)
    _llama_model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window
        n_threads=2,      # HF Spaces free tier has 2 vCPUs
        n_batch=256,      # Smaller batch for memory efficiency
        verbose=False,    # Quiet mode
        use_mmap=True,    # Memory mapping for faster loading
        use_mlock=False,  # Don't lock memory (HF Spaces constraint)
    )

    print("Model loaded successfully!")
    print("Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")
    gc.collect()
    return _llama_model

def generate_stream(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200
) -> Generator[str, None, None]:
    """
    Streaming generation with llama.cpp (fast on CPU).
    """
    model = load_model()

    # llama.cpp native streaming - very fast on CPU
    stream = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stream=True,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )

    for output in stream:
        text = output["choices"][0]["text"]
        if text:
            yield text

    gc.collect()

def generate(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200
) -> str:
    """
    Non-streaming generation with llama.cpp.
    """
    model = load_model()

    output = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )
    gc.collect()
    return output["choices"][0]["text"]
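

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal smoke test showing
# how generate() and generate_stream() might be called. The ChatML-style
# prompt below is an assumption inferred from the "<|im_end|>" stop token;
# adjust it to whatever template the deployed app actually uses.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical prompt format - verify against the model card before relying on it.
    demo_prompt = (
        "<|im_start|>user\n"
        "Explain what GGUF quantization is in one sentence.<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # Non-streaming call: returns the full completion as a single string.
    print(generate(demo_prompt, temperature=0.7, max_tokens=100))

    # Streaming call: yields text chunks as llama.cpp produces them.
    for chunk in generate_stream(demo_prompt, temperature=0.7, max_tokens=100):
        print(chunk, end="", flush=True)
    print()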