Upload 8 files
- app/main.py +5 -3
- app/model.py +13 -16
app/main.py
CHANGED

@@ -19,6 +19,7 @@ async def lifespan(app: FastAPI):
     """Startup: Download and load model."""
     print("=" * 50)
     print("Starting up - Loading GGUF model...")
+    print("Model: Mungert/Nanbeige4-3B-Thinking-2511-GGUF")
     print("=" * 50)
     load_model()  # Pre-load on startup
     print("Ready for requests!")

@@ -27,8 +28,8 @@ async def lifespan(app: FastAPI):
 
 
 app = FastAPI(
-    title="
-    description="Fast CPU inference with llama.cpp",
+    title="Nanbeige4-3B-Thinking-GGUF API",
+    description="Fast CPU inference with llama.cpp (iq2_m quantized)",
     version="2.0.0",
     lifespan=lifespan
 )

@@ -38,9 +39,10 @@ app = FastAPI(
 async def health_check():
     return {
         "status": "ok",
-        "model": "
+        "model": "Nanbeige4-3B-Thinking-2511-iq2_m",
         "backend": "llama.cpp",
         "device": "cpu",
+        "quantization": "iq2_m",
         "optimized": True
     }
 
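The first hunk only shows the startup body; the decorator and the yield that hands control back to FastAPI sit outside the diff. A minimal sketch of how the lifespan handler presumably fits together, assuming the usual @asynccontextmanager pattern and an app.model import path (neither is shown in this commit):

from contextlib import asynccontextmanager
from fastapi import FastAPI

from app.model import load_model  # assumed import path; not visible in the diff

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: pre-load the GGUF model once, before the first request arrives
    load_model()
    yield
    # Shutdown: nothing to release explicitly; llama.cpp memory is freed on process exit

app = FastAPI(title="Nanbeige4-3B-Thinking-GGUF API", lifespan=lifespan)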
app/model.py
CHANGED

@@ -7,20 +7,16 @@ Uses llama.cpp for 2-4x faster inference on CPU.
 import gc
 import os
 from typing import Generator, Optional
-from pathlib import Path
 
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 
 # Global singleton
 _llama_model: Optional[Llama] = None
 
-# Model configuration
-MODEL_REPO = "
-MODEL_FILE = "
-# If this one doesn't work, try: "nanbeige-3b.Q4_0.gguf" (faster, less quality)
-# Or: "nanbeige-3b.Q5_K_M.gguf" (better quality, slower)
-
+# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
+MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
+MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
 CACHE_DIR = "/tmp/models"
 
 
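MODEL_FILE has to match the exact filename published in the GGUF repo, and quant filenames differ from repo to repo. A hypothetical pre-flight check (not part of this commit) that lists the available .gguf files before hardcoding one:

from huggingface_hub import list_repo_files

# List every GGUF quant in the repo so the MODEL_FILE constant can be verified
files = list_repo_files("Mungert/Nanbeige4-3B-Thinking-2511-GGUF")
print([f for f in files if f.endswith(".gguf")])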
@@ -35,6 +31,7 @@ def download_gguf_model() -> str:
     # If it's already downloaded
     if os.path.exists(local_path):
         print(f"GGUF model already exists: {local_path}")
+        print(f"Size: {os.path.getsize(local_path) / (1024*1024):.1f} MB")
         return local_path
 
     print(f"Downloading GGUF model: {MODEL_FILE}")
@@ -51,11 +48,11 @@ def download_gguf_model() -> str:
             local_dir_use_symlinks=False
         )
         print(f"Model downloaded to: {downloaded_path}")
+        print(f"Size: {os.path.getsize(downloaded_path) / (1024*1024):.1f} MB")
         return downloaded_path
 
     except Exception as e:
         print(f"Error downloading GGUF model: {e}")
-        print("Falling back to smaller model or available alternative...")
         raise
 
 
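The two hunks above only touch the edges of download_gguf_model; the hf_hub_download call in between is not part of the diff. A sketch of how the whole helper presumably reads, with the local_path construction and the exact keyword arguments assumed from the visible fragments:

import os
from huggingface_hub import hf_hub_download

def download_gguf_model() -> str:
    # Assumed layout: cache the GGUF file under CACHE_DIR and reuse it across restarts
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, MODEL_FILE)
    if os.path.exists(local_path):
        return local_path
    try:
        downloaded_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            local_dir=CACHE_DIR,
            local_dir_use_symlinks=False,  # visible in the hunk above
        )
        return downloaded_path
    except Exception as e:
        print(f"Error downloading GGUF model: {e}")
        raise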
@@ -73,21 +70,21 @@ def load_model() -> Llama:
     model_path = download_gguf_model()
 
     print("Loading GGUF model with llama.cpp (CPU optimized)...")
-    print("
+    print("Using iq2_m quantization (2-bit, very fast)")
 
-    # CPU optimizations
+    # CPU optimizations for HF Spaces (2 vCPU, limited RAM)
     _llama_model = Llama(
         model_path=model_path,
         n_ctx=2048,  # Context window
-        n_threads=
-        n_batch=
+        n_threads=2,  # HF Spaces free tier has 2 vCPUs
+        n_batch=256,  # Smaller batch for memory efficiency
         verbose=False,  # Quiet mode
         use_mmap=True,  # Memory mapping for faster loading
         use_mlock=False,  # Don't lock memory (HF Spaces constraint)
     )
 
     print(f"Model loaded successfully!")
-    print(f"Threads:
+    print(f"Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")
 
     gc.collect()
     return _llama_model
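n_threads is pinned to 2 to match the free CPU tier. An alternative (not in this commit) is to derive the thread count at runtime so the same code scales on larger hardware:

import os

# Use every available core, falling back to 1 if the count cannot be determined
n_threads = max(1, os.cpu_count() or 1)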
@@ -110,7 +107,7 @@ def generate_stream(
         temperature=temperature,
         top_p=0.95,
         stream=True,
-        stop=["</s>", "User:", "Human:", "Assistant:"]
+        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
     )
 
     for output in stream:
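Only the tail of the streaming call is visible in this hunk. A sketch of how the generator presumably wraps llama-cpp-python's create_completion; the function signature and prompt handling are assumptions, not shown in the diff:

def generate_stream(prompt: str, max_tokens: int = 256, temperature: float = 0.7):
    llm = load_model()
    stream = llm.create_completion(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stream=True,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )
    for output in stream:
        # Each streamed chunk carries a text delta under choices[0]["text"]
        yield output["choices"][0]["text"]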
@@ -136,7 +133,7 @@ def generate(
         max_tokens=max_tokens,
         temperature=temperature,
         top_p=0.95,
-        stop=["</s>", "User:", "Human:", "Assistant:"]
+        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
     )
 
     gc.collect()
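A hypothetical local smoke test (not part of the commit) to exercise the module before redeploying the Space; the import path and the generate signature are assumed from the fragments above:

from app.model import load_model, generate

load_model()  # downloads the GGUF file on first run, then loads it
print(generate("Explain GGUF quantization in one sentence.", max_tokens=64))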