# app/model.py
"""
CPU-optimized model loading with automatic GGUF download.
Uses llama.cpp for 2-4x faster inference on CPU.
"""

import gc
import os
from typing import Generator, Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Global singleton
_llama_model: Optional[Llama] = None

# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
CACHE_DIR = "/tmp/models"


def download_gguf_model() -> str:
    """
    Download the GGUF model from Hugging Face if it is not already present.
    Returns the local path to the model file.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, MODEL_FILE)

    # Skip the download if the file already exists locally
    if os.path.exists(local_path):
        print(f"GGUF model already exists: {local_path}")
        print(f"Size: {os.path.getsize(local_path) / (1024*1024):.1f} MB")
        return local_path

    print(f"Downloading GGUF model: {MODEL_FILE}")
    print(f"From: {MODEL_REPO}")
    print("This may take a few minutes...")

    try:
        # Download from Hugging Face
        downloaded_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            cache_dir=CACHE_DIR,
            local_dir=CACHE_DIR,
            local_dir_use_symlinks=False  # deprecated/ignored on newer huggingface_hub releases
        )
        print(f"Model downloaded to: {downloaded_path}")
        print(f"Size: {os.path.getsize(downloaded_path) / (1024*1024):.1f} MB")
        return downloaded_path
    except Exception as e:
        print(f"Error downloading GGUF model: {e}")
        raise


def load_model() -> Llama:
    """
    Load the GGUF model with llama.cpp (optimized for CPU).
    Downloads it automatically if not present.
    """
    global _llama_model

    if _llama_model is not None:
        return _llama_model

    # Download if needed
    model_path = download_gguf_model()

    print("Loading GGUF model with llama.cpp (CPU optimized)...")
    print("Using iq2_m quantization (2-bit, very fast)")

    # CPU optimizations for HF Spaces (2 vCPU, limited RAM)
    _llama_model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window
        n_threads=2,      # HF Spaces free tier has 2 vCPUs
        n_batch=256,      # Smaller batch for memory efficiency
        verbose=False,    # Quiet mode
        use_mmap=True,    # Memory mapping for faster loading
        use_mlock=False,  # Don't lock memory (HF Spaces constraint)
    )

    print("Model loaded successfully!")
    print("Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")
    gc.collect()
    return _llama_model


def generate_stream(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200
) -> Generator[str, None, None]:
    """
    Streaming generation with llama.cpp (fast on CPU).
    """
    model = load_model()

    # llama.cpp native streaming - very fast on CPU
    stream = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stream=True,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
    )

    for output in stream:
        text = output["choices"][0]["text"]
        if text:
            yield text

    gc.collect()


def generate(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200
) -> str:
    """
    Non-streaming generation with llama.cpp.
    """
    model = load_model()

    output = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
    )
    gc.collect()
    return output["choices"][0]["text"]
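

# Minimal usage sketch: how the two generation helpers above might be
# exercised in a local smoke test. The ChatML-style prompt format below is
# an assumption inferred from the "<|im_end|>" stop token; the exact chat
# template expected by Nanbeige4-3B-Thinking-2511 should be taken from its
# model card, so treat this prompt as illustrative only.
if __name__ == "__main__":
    # Hypothetical prompt, assumed format (ChatML-like) for this sketch
    demo_prompt = (
        "<|im_start|>user\n"
        "Explain what GGUF quantization is in one sentence.<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # Streaming: print chunks as llama.cpp emits them
    for chunk in generate_stream(demo_prompt, temperature=0.7, max_tokens=100):
        print(chunk, end="", flush=True)
    print()

    # Non-streaming: one full completion as a single string
    print(generate(demo_prompt, temperature=0.7, max_tokens=100))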