Karan6933 committed
Commit 9c2f3d5 · verified · Parent: 05944f9

Upload 8 files

Files changed (2)
  1. app/main.py +5 -3
  2. app/model.py +13 -16
app/main.py CHANGED
@@ -19,6 +19,7 @@ async def lifespan(app: FastAPI):
     """Startup: Download and load model."""
     print("=" * 50)
     print("Starting up - Loading GGUF model...")
+    print("Model: Mungert/Nanbeige4-3B-Thinking-2511-GGUF")
     print("=" * 50)
     load_model()  # Pre-load on startup
     print("Ready for requests!")
@@ -27,8 +28,8 @@ async def lifespan(app: FastAPI):


 app = FastAPI(
-    title="Nanbeige3B-GGUF API",
-    description="Fast CPU inference with llama.cpp",
+    title="Nanbeige4-3B-Thinking-GGUF API",
+    description="Fast CPU inference with llama.cpp (iq2_m quantized)",
     version="2.0.0",
     lifespan=lifespan
 )
@@ -38,9 +39,10 @@ app = FastAPI(
 async def health_check():
     return {
         "status": "ok",
-        "model": "Nanbeige-3B-GGUF",
+        "model": "Nanbeige4-3B-Thinking-2511-iq2_m",
         "backend": "llama.cpp",
         "device": "cpu",
+        "quantization": "iq2_m",
         "optimized": True
     }

app/model.py CHANGED
@@ -7,20 +7,16 @@ Uses llama.cpp for 2-4x faster inference on CPU.
 import gc
 import os
 from typing import Generator, Optional
-from pathlib import Path

-from huggingface_hub import hf_hub_download, list_repo_files
+from huggingface_hub import hf_hub_download
 from llama_cpp import Llama

 # Global singleton
 _llama_model: Optional[Llama] = None

-# Model configuration
-MODEL_REPO = "TheBloke/Nanbeige-3B-GGUF"  # GGUF version is available
-MODEL_FILE = "nanbeige-3b.Q4_K_M.gguf"  # 4-bit quantized, balanced quality/speed
-# If this doesn't work, try: "nanbeige-3b.Q4_0.gguf" (faster, lower quality)
-# Or: "nanbeige-3b.Q5_K_M.gguf" (better quality, slower)
-
+# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
+MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
+MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
 CACHE_DIR = "/tmp/models"


@@ -35,6 +31,7 @@ def download_gguf_model() -> str:
     # If already downloaded
     if os.path.exists(local_path):
         print(f"GGUF model already exists: {local_path}")
+        print(f"Size: {os.path.getsize(local_path) / (1024*1024):.1f} MB")
         return local_path

     print(f"Downloading GGUF model: {MODEL_FILE}")
@@ -51,11 +48,11 @@ def download_gguf_model() -> str:
             local_dir_use_symlinks=False
         )
         print(f"Model downloaded to: {downloaded_path}")
+        print(f"Size: {os.path.getsize(downloaded_path) / (1024*1024):.1f} MB")
         return downloaded_path

     except Exception as e:
         print(f"Error downloading GGUF model: {e}")
-        print("Falling back to smaller model or available alternative...")
         raise


@@ -73,21 +70,21 @@ def load_model() -> Llama:
     model_path = download_gguf_model()

     print("Loading GGUF model with llama.cpp (CPU optimized)...")
-    print("This is 2-4x faster than transformers!")
+    print("Using iq2_m quantization (2-bit, very fast)")

-    # CPU optimizations
+    # CPU optimizations for HF Spaces (2 vCPU, limited RAM)
     _llama_model = Llama(
         model_path=model_path,
         n_ctx=2048,  # Context window
-        n_threads=4,  # CPU threads (tune based on your CPU)
-        n_batch=512,  # Batch size for prompt processing
+        n_threads=2,  # HF Spaces free tier has 2 vCPUs
+        n_batch=256,  # Smaller batch for memory efficiency
         verbose=False,  # Quiet mode
         use_mmap=True,  # Memory mapping for faster loading
         use_mlock=False,  # Don't lock memory (HF Spaces constraint)
     )

     print(f"Model loaded successfully!")
-    print(f"Threads: 4 | Context: 2048 | Quantization: Q4_K_M")
+    print(f"Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")

     gc.collect()
     return _llama_model
@@ -110,7 +107,7 @@ def generate_stream(
         temperature=temperature,
         top_p=0.95,
         stream=True,
-        stop=["</s>", "User:", "Human:", "Assistant:"]
+        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
     )

     for output in stream:
@@ -136,7 +133,7 @@ def generate(
         max_tokens=max_tokens,
         temperature=temperature,
         top_p=0.95,
-        stop=["</s>", "User:", "Human:", "Assistant:"]
+        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
     )

     gc.collect()
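For context, a minimal standalone sketch of the flow this commit switches to: download the iq2_m GGUF with hf_hub_download, load it with llama.cpp using the CPU-constrained settings above, and generate with the expanded stop list. The plain "User:/Assistant:" prompt is an assumption; the real app may apply the model's chat template.

```python
# Minimal sketch of the updated flow: fetch the iq2_m GGUF from the Hub,
# load it with llama.cpp on CPU, and generate with the new stop tokens.
# The prompt format below is an assumption, not taken from this commit.
import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"
CACHE_DIR = "/tmp/models"

# Download the quantized model into the local cache directory, as the app does
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir=CACHE_DIR,
)
print(f"Size: {os.path.getsize(model_path) / (1024 * 1024):.1f} MB")

# CPU-constrained settings matching the commit (2 threads, small batch, mmap)
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_batch=256,
    verbose=False,
    use_mmap=True,
    use_mlock=False,
)

# Single-shot completion with the expanded stop list, including "<|im_end|>"
out = llm(
    "User: Give one sentence about GGUF quantization.\nAssistant:",
    max_tokens=128,
    temperature=0.7,
    top_p=0.95,
    stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
)
print(out["choices"][0]["text"])
```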