Karan6933 committed
Commit 9c2f3d5 · verified · Parent: 05944f9

Upload 8 files

Files changed (2)
  1. app/main.py +5 -3
  2. app/model.py +13 -16
app/main.py CHANGED
@@ -19,6 +19,7 @@ async def lifespan(app: FastAPI):
     """Startup: Download and load model."""
     print("=" * 50)
     print("Starting up - Loading GGUF model...")
+    print("Model: Mungert/Nanbeige4-3B-Thinking-2511-GGUF")
     print("=" * 50)
     load_model()  # Pre-load on startup
     print("Ready for requests!")
@@ -27,8 +28,8 @@ async def lifespan(app: FastAPI):


 app = FastAPI(
-    title="Nanbeige3B-GGUF API",
-    description="Fast CPU inference with llama.cpp",
+    title="Nanbeige4-3B-Thinking-GGUF API",
+    description="Fast CPU inference with llama.cpp (iq2_m quantized)",
     version="2.0.0",
     lifespan=lifespan
 )
@@ -38,9 +39,10 @@ app = FastAPI(
 async def health_check():
     return {
         "status": "ok",
-        "model": "Nanbeige-3B-GGUF",
+        "model": "Nanbeige4-3B-Thinking-2511-iq2_m",
         "backend": "llama.cpp",
         "device": "cpu",
+        "quantization": "iq2_m",
         "optimized": True
     }

app/model.py CHANGED
@@ -7,20 +7,16 @@ Uses llama.cpp for 2-4x faster inference on CPU.
 import gc
 import os
 from typing import Generator, Optional
-from pathlib import Path

-from huggingface_hub import hf_hub_download, list_repo_files
+from huggingface_hub import hf_hub_download
 from llama_cpp import Llama

 # Global singleton
 _llama_model: Optional[Llama] = None

-# Model configuration
-MODEL_REPO = "TheBloke/Nanbeige-3B-GGUF"  # GGUF version is available
-MODEL_FILE = "nanbeige-3b.Q4_K_M.gguf"  # 4-bit quantized, balanced quality/speed
-# If this doesn't work, try: "nanbeige-3b.Q4_0.gguf" (faster, lower quality)
-# Or: "nanbeige-3b.Q5_K_M.gguf" (better quality, slower)
-
+# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
+MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
+MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
 CACHE_DIR = "/tmp/models"


@@ -35,6 +31,7 @@ def download_gguf_model() -> str:
     # If already downloaded
     if os.path.exists(local_path):
         print(f"GGUF model already exists: {local_path}")
+        print(f"Size: {os.path.getsize(local_path) / (1024*1024):.1f} MB")
         return local_path

     print(f"Downloading GGUF model: {MODEL_FILE}")
@@ -51,11 +48,11 @@ def download_gguf_model() -> str:
             local_dir_use_symlinks=False
         )
         print(f"Model downloaded to: {downloaded_path}")
+        print(f"Size: {os.path.getsize(downloaded_path) / (1024*1024):.1f} MB")
         return downloaded_path

     except Exception as e:
         print(f"Error downloading GGUF model: {e}")
-        print("Falling back to smaller model or available alternative...")
         raise


@@ -73,21 +70,21 @@ def load_model() -> Llama:
     model_path = download_gguf_model()

     print("Loading GGUF model with llama.cpp (CPU optimized)...")
-    print("This is 2-4x faster than transformers!")
+    print("Using iq2_m quantization (2-bit, very fast)")

-    # CPU optimizations
+    # CPU optimizations for HF Spaces (2 vCPU, limited RAM)
     _llama_model = Llama(
         model_path=model_path,
         n_ctx=2048,  # Context window
-        n_threads=4,  # CPU threads (tune based on your CPU)
-        n_batch=512,  # Batch size for prompt processing
+        n_threads=2,  # HF Spaces free tier has 2 vCPUs
+        n_batch=256,  # Smaller batch for memory efficiency
         verbose=False,  # Quiet mode
         use_mmap=True,  # Memory mapping for faster loading
         use_mlock=False,  # Don't lock memory (HF Spaces constraint)
     )

     print(f"Model loaded successfully!")
-    print(f"Threads: 4 | Context: 2048 | Quantization: Q4_K_M")
+    print(f"Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")

     gc.collect()
     return _llama_model
@@ -110,7 +107,7 @@ def generate_stream(
         temperature=temperature,
         top_p=0.95,
         stream=True,
-        stop=["</s>", "User:", "Human:", "Assistant:"]
+        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
     )

     for output in stream:
@@ -136,7 +133,7 @@ def generate(
         max_tokens=max_tokens,
         temperature=temperature,
         top_p=0.95,
-        stop=["</s>", "User:", "Human:", "Assistant:"]
+        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
     )

     gc.collect()
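For context, a minimal standalone sketch of the flow this commit switches to: download the iq2_m GGUF with hf_hub_download, load it with llama.cpp using the CPU-constrained settings above, and generate with the expanded stop list. The plain "User:/Assistant:" prompt is an assumption; the real app may apply the model's chat template.

```python
# Minimal sketch of the updated flow: fetch the iq2_m GGUF from the Hub,
# load it with llama.cpp on CPU, and generate with the new stop tokens.
# The prompt format below is an assumption, not taken from this commit.
import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"
CACHE_DIR = "/tmp/models"

# Download the quantized model into the local cache directory, as the app does
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir=CACHE_DIR,
)
print(f"Size: {os.path.getsize(model_path) / (1024 * 1024):.1f} MB")

# CPU-constrained settings matching the commit (2 threads, small batch, mmap)
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_batch=256,
    verbose=False,
    use_mmap=True,
    use_mlock=False,
)

# Single-shot completion with the expanded stop list, including "<|im_end|>"
out = llm(
    "User: Give one sentence about GGUF quantization.\nAssistant:",
    max_tokens=128,
    temperature=0.7,
    top_p=0.95,
    stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
)
print(out["choices"][0]["text"])
```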