Karan6933 committed on
Commit 86a78e2 · verified · 1 Parent(s): 538c943

Upload 7 files

Files changed (6)
  1. Dockerfile +3 -1
  2. app/main.py +34 -18
  3. app/model.py +41 -56
  4. app/schemas.py +1 -1
  5. requirements.txt +0 -1
  6. run.sh +5 -0
Dockerfile CHANGED
@@ -6,7 +6,9 @@ ENV PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     HF_HOME=/tmp/.huggingface \
     TRANSFORMERS_CACHE=/tmp/.cache/huggingface \
-    HF_HUB_CACHE=/tmp/.cache/huggingface/hub
+    HF_HUB_CACHE=/tmp/.cache/huggingface/hub \
+    OMP_NUM_THREADS=4 \
+    MKL_NUM_THREADS=4
 
 # Install minimal system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
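
The new OMP_NUM_THREADS / MKL_NUM_THREADS settings cap the OpenMP and MKL thread pools that PyTorch uses for CPU inference. A minimal sketch (assuming only that torch is installed in the image) to confirm the caps are picked up inside the container:

# thread_check.py - hypothetical helper, not part of this commit.
# The env vars must be set before torch is imported to take effect.
import os

os.environ.setdefault("OMP_NUM_THREADS", "4")
os.environ.setdefault("MKL_NUM_THREADS", "4")

import torch

# The intra-op pool size should reflect the cap from the Dockerfile/run.sh.
print("intra-op threads:", torch.get_num_threads())

# The cap can also be pinned explicitly at startup as a belt-and-braces measure.
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))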
app/main.py CHANGED
@@ -1,7 +1,7 @@
 # app/main.py
 """
 FastAPI application for serving Nanbeige4.1-3B model.
-Optimized for Hugging Face Spaces (CPU, Docker).
+CPU-ONLY optimized for Hugging Face Spaces (Docker).
 """
 
 import asyncio
@@ -22,17 +22,17 @@ async def lifespan(app: FastAPI):
     Loads model on startup to ensure it's ready for requests.
     """
     # Startup: Load model
-    print("Loading model...")
+    print("Loading model on CPU...")
     load_model()
-    print("Model loaded successfully")
+    print("Model loaded successfully on CPU")
     yield
-    # Shutdown: Cleanup (if needed)
+    # Shutdown: Cleanup
     print("Shutting down...")
 
 
 app = FastAPI(
-    title="Nanbeige4.1-3B API",
-    description="FastAPI wrapper for Nanbeige4.1-3B with streaming support",
+    title="Nanbeige4.1-3B API (CPU)",
+    description="FastAPI wrapper for Nanbeige4.1-3B - CPU Optimized",
     version="1.0.0",
     lifespan=lifespan
 )
@@ -41,7 +41,12 @@ app = FastAPI(
 @app.get("/")
 async def health_check():
     """Health check endpoint."""
-    return {"status": "ok", "model": "Nanbeige4.1-3B"}
+    return {
+        "status": "ok",
+        "model": "Nanbeige4.1-3B",
+        "device": "cpu",
+        "mode": "float32"
+    }
 
 
 @app.post("/generate")
@@ -56,14 +61,21 @@ async def generate_text(request: GenerationRequest):
     if request.stream:
         # Streaming response
         async def stream_generator():
-            # Run sync generator in thread pool to not block
+            # Run sync generator in thread pool to not block event loop
             loop = asyncio.get_event_loop()
-            sync_gen = generate_stream(
-                final_prompt,
-                temperature=request.temperature,
-                max_tokens=request.max_tokens
-            )
 
+            # Use run_in_executor for CPU-bound operations
+            def sync_generator():
+                return generate_stream(
+                    final_prompt,
+                    temperature=request.temperature,
+                    max_tokens=request.max_tokens
+                )
+
+            # Get the generator
+            sync_gen = await loop.run_in_executor(None, sync_generator)
+
+            # Iterate through chunks
             for chunk in sync_gen:
                 if chunk:
                     # SSE format
@@ -80,10 +92,14 @@ async def generate_text(request: GenerationRequest):
             }
         )
     else:
-        # Non-streaming response
-        result = generate(
-            final_prompt,
-            temperature=request.temperature,
-            max_tokens=request.max_tokens
+        # Non-streaming response - run in executor to not block
+        loop = asyncio.get_event_loop()
+        result = await loop.run_in_executor(
+            None,
+            lambda: generate(
+                final_prompt,
+                temperature=request.temperature,
+                max_tokens=request.max_tokens
+            )
         )
         return GenerationResponse(text=result)
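
For reference, a hypothetical client for the updated /generate endpoint. The prompt field name, the port, and the exact SSE framing are assumptions not visible in this diff, and `requests` is assumed to be available on the client side:

# client_sketch.py - illustrative only; adjust field names to the real GenerationRequest.
import requests

BASE_URL = "http://localhost:7860"  # assumed port

# Non-streaming call
resp = requests.post(f"{BASE_URL}/generate", json={
    "prompt": "Hello",            # assumed field name
    "temperature": 0.7,
    "max_tokens": 128,
    "stream": False,
})
print(resp.json()["text"])

# Streaming call: read SSE-style chunks line by line
with requests.post(f"{BASE_URL}/generate", json={
    "prompt": "Hello",
    "temperature": 0.7,
    "max_tokens": 128,
    "stream": True,
}, stream=True) as r:
    for line in r.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):   # assumed SSE prefix
            print(line[len("data: "):], end="", flush=True)

Design note: run_in_executor here offloads only the construction of the generator; the per-chunk iteration in stream_generator still happens on the event loop thread.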
app/model.py CHANGED
@@ -1,6 +1,7 @@
 # app/model.py
 """
 Model loading and inference utilities for Nanbeige/Nanbeige4.1-3B.
+CPU-optimized implementation - NO GPU/CUDA code.
 Implements singleton pattern to ensure model loads only once.
 """
 
@@ -9,35 +10,24 @@ import os
 from typing import Generator, Optional
 
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # Global singleton instances
 _tokenizer: Optional[AutoTokenizer] = None
 _model: Optional[AutoModelForCausalLM] = None
 
 
-def get_quantization_config() -> Optional[BitsAndBytesConfig]:
-    """
-    Configure 4-bit quantization for CPU memory efficiency.
-    Returns None if bitsandbytes is not available or on CPU.
-    """
-    try:
-        # 4-bit quantization config for minimal memory footprint
-        return BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True,
-        )
-    except Exception:
-        return None
-
-
 def load_model() -> tuple[AutoTokenizer, AutoModelForCausalLM]:
     """
     Load tokenizer and model with singleton pattern.
     Loads only on first call, returns cached instances thereafter.
 
+    CPU Optimization Notes:
+    - Use torch.float32 (float16 is 7x slower on CPU)
+    - low_cpu_mem_usage=True prevents memory spikes
+    - No device_map (do not rely on device_map="auto" on CPU)
+    - trust_remote_code=True required for Nanbeige models
+
     Returns:
         Tuple of (tokenizer, model)
     """
@@ -55,32 +45,29 @@ def load_model() -> tuple[AutoTokenizer, AutoModelForCausalLM]:
         trust_remote_code=True
     )
 
-    # Configure model loading for CPU
-    # Use torch.float16 for memory efficiency on CPU
-    model_kwargs = {
-        "torch_dtype": torch.float16,
-        "trust_remote_code": True,
-        "low_cpu_mem_usage": True,
-    }
-
-    # Try to use quantization if available, otherwise use standard loading
-    quant_config = get_quantization_config()
-    if quant_config is not None:
-        model_kwargs["quantization_config"] = quant_config
+    # Set pad token if not present
+    if _tokenizer.pad_token is None:
+        _tokenizer.pad_token = _tokenizer.eos_token
+        _tokenizer.pad_token_id = _tokenizer.eos_token_id
 
-    # Load model
+    # CPU-optimized model loading
+    # IMPORTANT: Use float32, NOT float16 (float16 is extremely slow on CPU)
    _model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        **model_kwargs
+        torch_dtype=torch.float32,   # float32 is the right dtype on CPU
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,      # Memory optimization
+        device_map=None,             # keep device_map explicitly None on CPU
     )
 
-    # Ensure model is in eval mode
+    # Explicitly set to CPU (redundant but safe)
+    _model = _model.to("cpu")
+
+    # Evaluation mode for inference
     _model.eval()
 
     # Clear cache to free memory
     gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
 
     return _tokenizer, _model
 
@@ -110,21 +97,10 @@ def generate_stream(
         add_special_tokens=False
     )
 
-    # Move to same device as model
-    input_ids = inputs.input_ids.to(model.device)
+    # Keep on CPU
+    input_ids = inputs.input_ids
 
-    # Generation parameters optimized for Nanbeige
-    generation_kwargs = {
-        "input_ids": input_ids,
-        "max_new_tokens": max_tokens,
-        "temperature": temperature,
-        "top_p": 0.95,
-        "do_sample": True,
-        "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
-        "eos_token_id": tokenizer.eos_token_id,
-    }
-
-    # Stream generation using generate with streamer
+    # Stream generation using TextIteratorStreamer
     from transformers import TextIteratorStreamer
     from threading import Thread
 
@@ -133,16 +109,25 @@
         skip_prompt=True,
         skip_special_tokens=True
     )
-    generation_kwargs["streamer"] = streamer
+
+    generation_kwargs = {
+        "input_ids": input_ids,
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": 0.95,
+        "do_sample": True,
+        "pad_token_id": tokenizer.pad_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
+        "streamer": streamer,
+    }
 
     # Run generation in separate thread to enable streaming
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    generated_text = ""
     for text in streamer:
-        generated_text += text
-        yield text
+        if text:
+            yield text
 
     thread.join()
 
@@ -175,9 +160,9 @@ def generate(
         add_special_tokens=False
    )
 
-    input_ids = inputs.input_ids.to(model.device)
+    input_ids = inputs.input_ids
 
-    # Generate
+    # Generate with no_grad for memory efficiency
     with torch.no_grad():
         output_ids = model.generate(
             input_ids,
@@ -185,7 +170,7 @@
             temperature=temperature,
             top_p=0.95,
             do_sample=True,
-            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
+            pad_token_id=tokenizer.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
         )
 
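The streaming path now builds generation_kwargs around a TextIteratorStreamer and runs model.generate in a background thread. A self-contained sketch of that pattern, using a tiny public model purely for illustration (a stand-in, not the Nanbeige checkpoint):

# streamer_sketch.py - minimal demo of the thread + TextIteratorStreamer pattern.
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

name = "sshleifer/tiny-gpt2"  # illustrative stand-in for Nanbeige/Nanbeige4.1-3B
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.float32)
model.eval()

inputs = tok("Hello", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

generation_kwargs = dict(
    input_ids=inputs.input_ids,
    max_new_tokens=20,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
    pad_token_id=tok.eos_token_id,
    streamer=streamer,
)

# generate() runs in a worker thread and pushes decoded text into the streamer
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for piece in streamer:      # blocks until the next decoded chunk is ready
    if piece:
        print(piece, end="", flush=True)
thread.join()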
app/schemas.py CHANGED
@@ -23,7 +23,7 @@ class GenerationRequest(BaseModel):
     max_tokens: int = Field(
         default=200,
         ge=1,
-        le=2048,
+        le=512,
         description="Maximum tokens to generate"
     )
     stream: bool = Field(
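
With the ceiling lowered from 2048 to 512, over-limit requests are now rejected at validation time, before any generation work. A quick sketch of the changed constraint (only the max_tokens field is reproduced; the description argument is omitted for brevity):

# bounds_sketch.py - mirrors the le=512 constraint above.
from pydantic import BaseModel, Field, ValidationError

class GenerationRequest(BaseModel):
    max_tokens: int = Field(default=200, ge=1, le=512)

GenerationRequest(max_tokens=512)       # accepted
try:
    GenerationRequest(max_tokens=2048)  # previously allowed, now rejected
except ValidationError as exc:
    print(exc)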
requirements.txt CHANGED
@@ -6,6 +6,5 @@ transformers==4.46.0
 torch==2.5.0
 accelerate==1.0.0
 sentencepiece==0.2.0
-bitsandbytes==0.44.0
 huggingface-hub==0.26.0
 python-multipart==0.0.12
run.sh CHANGED
@@ -1,6 +1,11 @@
 #!/bin/bash
 # run.sh
 # Production startup script for uvicorn server
+# Optimized for CPU-only Hugging Face Spaces
+
+export OMP_NUM_THREADS=4
+export MKL_NUM_THREADS=4
+export TRANSFORMERS_CACHE=/tmp/.cache/huggingface
 
 exec uvicorn app.main:app \
     --host 0.0.0.0 \
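
A hypothetical smoke test once run.sh has started the server; the port is an assumption, since the uvicorn flags after --host are truncated in this diff:

# smoke_test.py - illustrative only.
import requests  # assumed available on the client side

resp = requests.get("http://localhost:7860/")  # assumed port
resp.raise_for_status()
print(resp.json())
# Expected shape per the updated health_check():
# {"status": "ok", "model": "Nanbeige4.1-3B", "device": "cpu", "mode": "float32"}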