# app/model.py
"""
CPU-optimized model loading with automatic GGUF download.
Uses llama.cpp for 2-4x faster inference on CPU.
"""
import gc
import os
from typing import Generator, Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Global singleton
_llama_model: Optional[Llama] = None

# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
CACHE_DIR = "/tmp/models"


def download_gguf_model() -> str:
    """
    Download the GGUF model from Hugging Face if it does not exist locally.
    Returns the local path to the model file.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, MODEL_FILE)

    # Skip the download if the file is already present
    if os.path.exists(local_path):
        print(f"GGUF model already exists: {local_path}")
        print(f"Size: {os.path.getsize(local_path) / (1024 * 1024):.1f} MB")
        return local_path

    print(f"Downloading GGUF model: {MODEL_FILE}")
    print(f"From: {MODEL_REPO}")
    print("This may take a few minutes...")

    try:
        # Download from Hugging Face
        downloaded_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            cache_dir=CACHE_DIR,
            local_dir=CACHE_DIR,
            local_dir_use_symlinks=False,
        )
        print(f"Model downloaded to: {downloaded_path}")
        print(f"Size: {os.path.getsize(downloaded_path) / (1024 * 1024):.1f} MB")
        return downloaded_path
    except Exception as e:
        print(f"Error downloading GGUF model: {e}")
        raise


def load_model() -> Llama:
    """
    Load the GGUF model with llama.cpp (optimized for CPU).
    Downloads the file automatically if it is not present.
    """
    global _llama_model

    if _llama_model is not None:
        return _llama_model

    # Download if needed
    model_path = download_gguf_model()

    print("Loading GGUF model with llama.cpp (CPU optimized)...")
    print("Using iq2_m quantization (2-bit, very fast)")

    # CPU optimizations for HF Spaces (2 vCPU, limited RAM)
    _llama_model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window
        n_threads=2,      # HF Spaces free tier has 2 vCPUs
        n_batch=256,      # Smaller batch for memory efficiency
        verbose=False,    # Quiet mode
        use_mmap=True,    # Memory-map the file for faster loading
        use_mlock=False,  # Don't lock memory (HF Spaces constraint)
    )

    print("Model loaded successfully!")
    print("Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")
    gc.collect()
    return _llama_model


def generate_stream(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200,
) -> Generator[str, None, None]:
    """
    Streaming generation with llama.cpp (fast on CPU).
    Yields text chunks as they are produced.
    """
    model = load_model()

    # llama.cpp native streaming - very fast on CPU
    stream = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stream=True,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )

    for output in stream:
        text = output["choices"][0]["text"]
        if text:
            yield text

    gc.collect()
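

# The stop list above includes "<|im_end|>", which suggests a ChatML-style
# chat template. The helper below is an illustrative sketch under that
# assumption only -- `build_chat_prompt` is not part of the original module,
# and the exact template should be verified against the model card.
def build_chat_prompt(
    user_message: str,
    system_message: str = "You are a helpful coding assistant.",
) -> str:
    """Assemble a ChatML-style prompt (assumed format; check the model card)."""
    return (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{user_message}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )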


def generate(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200,
) -> str:
    """
    Non-streaming generation with llama.cpp.
    """
    model = load_model()

    output = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )
    gc.collect()
    return output["choices"][0]["text"]
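

# Minimal smoke test, assuming the module is run directly inside the Space
# container (e.g. `python -m app.model`). The raw prompt string is purely
# illustrative; real callers should format prompts with the model's chat
# template (see the build_chat_prompt sketch above).
if __name__ == "__main__":
    demo_prompt = "Write a Python function that reverses a string."

    print("--- non-streaming ---")
    print(generate(demo_prompt, max_tokens=64))

    print("--- streaming ---")
    for chunk in generate_stream(demo_prompt, max_tokens=64):
        print(chunk, end="", flush=True)
    print()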