Karan6933 committed
Commit 05944f9 · verified · 1 Parent(s): 3809b3c

Upload 8 files

Files changed (6)
  1. Dockerfile +5 -5
  2. app/main.py +25 -51
  3. app/model.py +105 -158
  4. app/prompt.py +2 -10
  5. app/schemas.py +6 -30
  6. requirements.txt +2 -4
Dockerfile CHANGED
@@ -1,25 +1,25 @@
+# Dockerfile
 FROM python:3.11-slim

 ENV PYTHONUNBUFFERED=1 \
-    CMAKE_ARGS="-DLLAMA_AVX2=ON" \
+    CMAKE_ARGS="-DLLAMA_AVX2=ON -DLLAMA_AVX=ON -DLLAMA_FMA=ON" \
     FORCE_CMAKE=1

-# System deps (required for llama.cpp)
+# System deps for llama.cpp compilation
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     cmake \
     git \
+    wget \
     && rm -rf /var/lib/apt/lists/*

 WORKDIR /app

-# Install python deps (IMPORTANT)
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt

-# Copy app
 COPY app/ ./app/

 EXPOSE 7860

-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app/main.py CHANGED
@@ -1,14 +1,13 @@
 # app/main.py
 """
-FastAPI application for serving Nanbeige4.1-3B model.
-CPU-ONLY optimized for Hugging Face Spaces (Docker).
+FastAPI app with llama.cpp backend.
 """

 import asyncio
 from contextlib import asynccontextmanager

 from fastapi import FastAPI
-from fastapi.responses import StreamingResponse, JSONResponse
+from fastapi.responses import StreamingResponse

 from app.model import load_model, generate_stream, generate
 from app.prompt import build_prompt
@@ -17,89 +16,64 @@ from app.schemas import GenerationRequest, GenerationResponse

 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """
-    Lifespan context manager for startup/shutdown events.
-    Loads model on startup to ensure it's ready for requests.
-    """
-    # Startup: Load model
-    print("Loading model on CPU...")
-    load_model()
-    print("Model loaded successfully on CPU")
+    """Startup: Download and load model."""
+    print("=" * 50)
+    print("Starting up - Loading GGUF model...")
+    print("=" * 50)
+    load_model()  # Pre-load on startup
+    print("Ready for requests!")
     yield
-    # Shutdown: Cleanup
     print("Shutting down...")


 app = FastAPI(
-    title="Nanbeige4.1-3B API (CPU)",
-    description="FastAPI wrapper for Nanbeige4.1-3B - CPU Optimized",
-    version="1.0.0",
+    title="Nanbeige3B-GGUF API",
+    description="Fast CPU inference with llama.cpp",
+    version="2.0.0",
     lifespan=lifespan
 )


 @app.get("/")
 async def health_check():
-    """Health check endpoint."""
     return {
-        "status": "ok",
-        "model": "Nanbeige4.1-3B",
+        "status": "ok",
+        "model": "Nanbeige-3B-GGUF",
+        "backend": "llama.cpp",
         "device": "cpu",
-        "mode": "float32"
+        "optimized": True
     }


 @app.post("/generate")
 async def generate_text(request: GenerationRequest):
-    """
-    Generate text from prompt.
-    Supports both streaming and non-streaming responses.
-    """
-    # Build final prompt with system instructions
     final_prompt = build_prompt(request.prompt)

     if request.stream:
-        # Streaming response
         async def stream_generator():
-            # Run sync generator in thread pool to not block event loop
            loop = asyncio.get_event_loop()

-            # Use run_in_executor for CPU-bound operations
-            def sync_generator():
-                return generate_stream(
+            def sync_gen():
+                for chunk in generate_stream(
                    final_prompt,
                    temperature=request.temperature,
                    max_tokens=request.max_tokens
-                )
+                ):
+                    yield chunk

-            # Get the generator
-            sync_gen = await loop.run_in_executor(None, sync_generator)
-
-            # Iterate through chunks
-            for chunk in sync_gen:
+            for chunk in sync_gen():
                if chunk:
-                    # SSE format
                    yield f"data: {chunk}\n\n"
-
            yield "data: [DONE]\n\n"

         return StreamingResponse(
            stream_generator(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            }
+            media_type="text/event-stream"
         )
     else:
-        # Non-streaming response - run in executor to not block
-        loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            lambda: generate(
-                final_prompt,
-                temperature=request.temperature,
-                max_tokens=request.max_tokens
-            )
+        result = generate(
+            final_prompt,
+            temperature=request.temperature,
+            max_tokens=request.max_tokens
         )
         return GenerationResponse(text=result)
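The /generate endpoint streams tokens as server-sent events: each chunk arrives as a "data: ..." line and the stream ends with "data: [DONE]". A minimal client sketch using the requests library (already pinned in requirements.txt); the script name and the localhost:7860 base URL are assumptions for local testing, not part of this commit:

# stream_client.py - hypothetical SSE client for the endpoint above
import requests

payload = {
    "prompt": "What is llama.cpp?",
    "temperature": 0.7,
    "max_tokens": 100,
    "stream": True,
}

with requests.post("http://localhost:7860/generate", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separator lines between SSE events
        chunk = line[len("data: "):]
        if chunk == "[DONE]":
            break
        print(chunk, end="", flush=True)
print()

For non-streaming use, send "stream": false and read the "text" field of the GenerationResponse JSON instead.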
app/model.py CHANGED
@@ -1,7 +1,7 @@
-# app/model.py - llama.cpp optimized version
+# app/model.py
 """
-CPU-optimized model loading using llama-cpp-python.
-2-4x faster than transformers on CPU.
+CPU-optimized model loading with automatic GGUF download.
+Uses llama.cpp for 2-4x faster inference on CPU.
 """

 import gc
@@ -9,188 +9,135 @@ import os
 from typing import Generator, Optional
 from pathlib import Path

-# Try to use llama.cpp, fallback to transformers
-try:
-    from llama_cpp import Llama
-    LLAMA_AVAILABLE = True
-except ImportError:
-    LLAMA_AVAILABLE = False
-    from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import hf_hub_download, list_repo_files
+from llama_cpp import Llama

 # Global singleton
-_llama_model = None
-_transformer_model = None
-_tokenizer = None
+_llama_model: Optional[Llama] = None

+# Model configuration
+MODEL_REPO = "TheBloke/Nanbeige-3B-GGUF"  # GGUF version is available here
+MODEL_FILE = "nanbeige-3b.Q4_K_M.gguf"  # 4-bit quantized, balanced quality/speed
+# If this one doesn't work, try: "nanbeige-3b.Q4_0.gguf" (faster, lower quality)
+# Or: "nanbeige-3b.Q5_K_M.gguf" (better quality, slower)

-def get_model_path() -> str:
+CACHE_DIR = "/tmp/models"
+
+
+def download_gguf_model() -> str:
     """
-    Returns path to GGUF model.
-    If GGUF not available, returns HF model name.
+    Download GGUF model from Hugging Face if it does not exist.
+    Returns local path to model file.
     """
-    # First check whether the GGUF has already been downloaded
-    gguf_path = "/tmp/models/nanbeige-3b-q4_0.gguf"
-    if os.path.exists(gguf_path):
-        return gguf_path
+    os.makedirs(CACHE_DIR, exist_ok=True)
+    local_path = os.path.join(CACHE_DIR, MODEL_FILE)
+
+    # Already downloaded
+    if os.path.exists(local_path):
+        print(f"GGUF model already exists: {local_path}")
+        return local_path
+
+    print(f"Downloading GGUF model: {MODEL_FILE}")
+    print(f"From: {MODEL_REPO}")
+    print("This may take a few minutes...")

-    # If not, return the HF model name instead
-    return "Nanbeige/Nanbeige4.1-3B"
+    try:
+        # Download from Hugging Face
+        downloaded_path = hf_hub_download(
+            repo_id=MODEL_REPO,
+            filename=MODEL_FILE,
+            cache_dir=CACHE_DIR,
+            local_dir=CACHE_DIR,
+            local_dir_use_symlinks=False
+        )
+        print(f"Model downloaded to: {downloaded_path}")
+        return downloaded_path
+
+    except Exception as e:
+        print(f"Error downloading GGUF model: {e}")
+        print("Falling back to smaller model or available alternative...")
+        raise


-def load_model():
+def load_model() -> Llama:
     """
-    Load model with llama.cpp if available (GGUF),
-    otherwise fallback to optimized transformers.
+    Load GGUF model with llama.cpp (optimized for CPU).
+    Downloads automatically if not present.
     """
-    global _llama_model, _transformer_model, _tokenizer
+    global _llama_model

-    # Already loaded
-    if _llama_model or _transformer_model:
-        return
+    if _llama_model is not None:
+        return _llama_model

-    model_path = get_model_path()
+    # Download if needed
+    model_path = download_gguf_model()

-    # If the model is in GGUF format, use llama.cpp (FAST)
-    if model_path.endswith(".gguf") and LLAMA_AVAILABLE:
-        print("Loading GGUF model with llama.cpp (optimized)...")
-        _llama_model = Llama(
-            model_path=model_path,
-            n_ctx=2048,
-            n_threads=4,  # CPU threads
-            n_batch=512,
-            verbose=False
-        )
-        print("Model loaded with llama.cpp")
+    print("Loading GGUF model with llama.cpp (CPU optimized)...")
+    print("This is 2-4x faster than transformers!")

-    # Otherwise fall back to transformers (SLOW but works)
-    else:
-        print("GGUF not available, using transformers (slower)...")
-        import torch
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-
-        model_name = "Nanbeige/Nanbeige4.1-3B"
-
-        _tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            use_fast=False
-        )
-
-        if _tokenizer.pad_token is None:
-            _tokenizer.pad_token = _tokenizer.eos_token
-
-        _transformer_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float32,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            device_map=None,
-        )
-        _transformer_model = _transformer_model.to("cpu")
-        _transformer_model.eval()
-
-        # Disable gradients
-        for param in _transformer_model.parameters():
-            param.requires_grad = False
-
-        print("Model loaded with transformers")
+    # CPU optimizations
+    _llama_model = Llama(
+        model_path=model_path,
+        n_ctx=2048,       # Context window
+        n_threads=4,      # CPU threads (tune based on your CPU)
+        n_batch=512,      # Batch size for prompt processing
+        verbose=False,    # Quiet mode
+        use_mmap=True,    # Memory mapping for faster loading
+        use_mlock=False,  # Don't lock memory (HF Spaces constraint)
+    )
+
+    print("Model loaded successfully!")
+    print("Threads: 4 | Context: 2048 | Quantization: Q4_K_M")

     gc.collect()
+    return _llama_model


-def generate_stream(prompt: str, temperature: float = 0.7, max_tokens: int = 100):
+def generate_stream(
+    prompt: str,
+    temperature: float = 0.7,
+    max_tokens: int = 200
+) -> Generator[str, None, None]:
     """
-    Generate with llama.cpp (fast) or transformers (slow).
+    Streaming generation with llama.cpp (FAST).
     """
-    load_model()
+    model = load_model()

-    # llama.cpp path (FAST - 2-4x speedup)
-    if _llama_model:
-        # llama.cpp native streaming
-        stream = _llama_model(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=0.95,
-            stream=True,
-            stop=["</s>", "User:", "Human:"]
-        )
-
-        for output in stream:
-            text = output["choices"][0]["text"]
-            if text:
-                yield text
+    # llama.cpp native streaming - very fast on CPU
+    stream = model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=0.95,
+        stream=True,
+        stop=["</s>", "User:", "Human:", "Assistant:"]
+    )
+
+    for output in stream:
+        text = output["choices"][0]["text"]
+        if text:
+            yield text

-    # Transformers fallback (SLOW)
-    else:
-        import torch
-        from threading import Thread
-        from transformers import TextIteratorStreamer
-
-        inputs = _tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
-        input_ids = inputs.input_ids
-
-        streamer = TextIteratorStreamer(
-            _tokenizer,
-            skip_prompt=True,
-            skip_special_tokens=True
-        )
-
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "max_new_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": 0.95,
-            "do_sample": True,
-            "pad_token_id": _tokenizer.pad_token_id,
-            "eos_token_id": _tokenizer.eos_token_id,
-            "streamer": streamer,
-            "use_cache": True,
-        }
-
-        thread = Thread(target=_transformer_model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        for text in streamer:
-            if text:
-                yield text
-
-        thread.join()
-
     gc.collect()


-def generate(prompt: str, temperature: float = 0.7, max_tokens: int = 100) -> str:
+def generate(
+    prompt: str,
+    temperature: float = 0.7,
+    max_tokens: int = 200
+) -> str:
     """
-    Non-streaming generation.
+    Non-streaming generation with llama.cpp.
     """
-    load_model()
+    model = load_model()

-    if _llama_model:
-        output = _llama_model(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=0.95,
-            stop=["</s>", "User:", "Human:"]
-        )
-        return output["choices"][0]["text"]
+    output = model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=0.95,
+        stop=["</s>", "User:", "Human:", "Assistant:"]
+    )

-    else:
-        import torch
-        inputs = _tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
-
-        with torch.no_grad():
-            output_ids = _transformer_model.generate(
-                inputs.input_ids,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                top_p=0.95,
-                do_sample=True,
-                pad_token_id=_tokenizer.pad_token_id,
-                eos_token_id=_tokenizer.eos_token_id,
-                use_cache=True,
-            )
-
-            new_tokens = output_ids[0][len(inputs.input_ids[0]):]
-            return _tokenizer.decode(new_tokens, skip_special_tokens=True)
+    gc.collect()
+    return output["choices"][0]["text"]
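Because load_model() is a lazy singleton, app/model.py can also be exercised directly, outside FastAPI. A minimal local smoke-test sketch; it assumes the configured MODEL_REPO/MODEL_FILE can actually be downloaded (the commit's own comments leave the exact filename open) and the script name is illustrative:

# smoke_test_model.py - hypothetical local check, not part of this commit
from app.model import generate, generate_stream

# Non-streaming: returns the whole completion as one string.
# The first call triggers the GGUF download into /tmp/models, so expect a delay.
print(generate("User: Say hi.\nAssistant:", temperature=0.2, max_tokens=32))

# Streaming: yields text fragments as llama.cpp produces them.
for piece in generate_stream("User: Count to three.\nAssistant:", max_tokens=32):
    print(piece, end="", flush=True)
print()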
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/prompt.py CHANGED
@@ -1,6 +1,6 @@
 # app/prompt.py
 """
-Prompt building utilities for Nanbeige model.
+Prompt building utilities.
 """

 SYSTEM_PROMPT = """Tu ek helpful assistant hai. Hamesha concise aur accurate jawab de.
@@ -11,13 +11,5 @@ SYSTEM_PROMPT = """Tu ek helpful assistant hai. Hamesha concise aur accurate jaw


 def build_prompt(user_input: str) -> str:
-    """
-    Build the final prompt by combining system prompt with user input.
-
-    Args:
-        user_input: Raw user query/input
-
-    Returns:
-        Formatted prompt string ready for model inference
-    """
+    """Build final prompt with system instructions."""
     return f"{SYSTEM_PROMPT}\n\nUser: {user_input}\nAssistant:"
app/schemas.py CHANGED
@@ -1,41 +1,17 @@
 # app/schemas.py
 """
-Pydantic schemas for API request/response validation.
+Pydantic schemas.
 """

 from pydantic import BaseModel, Field


 class GenerationRequest(BaseModel):
-    """Request schema for text generation endpoint."""
-
-    prompt: str = Field(
-        ...,
-        min_length=1,
-        description="Input prompt text"
-    )
-    temperature: float = Field(
-        default=0.7,
-        ge=0.0,
-        le=2.0,
-        description="Sampling temperature"
-    )
-    max_tokens: int = Field(
-        default=200,
-        ge=1,
-        le=512,
-        description="Maximum tokens to generate"
-    )
-    stream: bool = Field(
-        default=True,
-        description="Whether to stream the response"
-    )
+    prompt: str = Field(..., min_length=1)
+    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
+    max_tokens: int = Field(default=200, ge=1, le=1024)
+    stream: bool = Field(default=True)


 class GenerationResponse(BaseModel):
-    """Response schema for non-streaming generation."""
-
-    text: str = Field(
-        ...,
-        description="Generated text response"
-    )
+    text: str
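The request schema enforces the bounds the endpoint relies on: temperature in [0, 2], max_tokens in [1, 1024], streaming on by default. A minimal sketch of how validation behaves under pydantic v2 (illustrative, not part of this commit):

# Illustrative only: exercising the request schema from app/schemas.py.
from pydantic import ValidationError
from app.schemas import GenerationRequest

req = GenerationRequest(prompt="Hello")  # defaults: temperature=0.7, max_tokens=200, stream=True
print(req.model_dump())

try:
    GenerationRequest(prompt="Hello", max_tokens=5000)  # exceeds le=1024
except ValidationError as e:
    print(e.errors()[0]["type"])  # expected to report a "less_than_equal" violation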
 
 
 
 
 
requirements.txt CHANGED
@@ -2,9 +2,7 @@
 fastapi==0.115.0
 uvicorn[standard]==0.32.0
 pydantic==2.9.0
-transformers==4.46.0
-torch==2.5.0
-accelerate==1.0.0
-sentencepiece==0.2.0
+llama-cpp-python==0.3.2
 huggingface-hub==0.26.0
+requests==2.32.0
 python-multipart==0.0.12
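To confirm the trimmed dependency set is what actually ended up in the image, a small standard-library sketch can print the installed versions; the script name is hypothetical and the list simply mirrors requirements.txt:

# check_pins.py - hypothetical: report installed versions of the pinned packages
from importlib.metadata import version

for pkg in ["fastapi", "uvicorn", "pydantic", "llama-cpp-python",
            "huggingface-hub", "requests", "python-multipart"]:
    print(pkg, version(pkg))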