Karan6933 committed
Commit 65a1dcc · verified · 1 Parent(s): a17c086

Upload 6 files

Files changed (6)
  1. Dockerfile +30 -10
  2. batcher.py +29 -0
  3. bridge.py +73 -0
  4. engine.py +100 -0
  5. main.py +94 -0
  6. setup_model.sh +2 -1
Dockerfile CHANGED
@@ -1,27 +1,47 @@
+ # Use a lightweight Python base
  FROM python:3.10-slim
  
  WORKDIR /app
  
- # Install ONLY runtime dependencies (no build tools!)
+ # Install system dependencies
  RUN apt-get update && apt-get install -y \
      libopenblas-dev \
-     libgomp1 \
-     wget \
      && rm -rf /var/lib/apt/lists/*
  
- # Install llama-cpp-python from PREBUILT wheel (3 seconds, no compilation)
- RUN pip install --no-cache-dir \
+
+ # Install llama-cpp-python from a PREBUILT wheel (3 seconds vs 10+ minutes)
+ RUN pip install \
      https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.22-cp310-cp310-linux_x86_64.whl
  
- # Install other Python deps
+ # Copy requirements first to leverage the Docker layer cache
  COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt
  
- # Copy application
+ # Install remaining requirements
+ RUN pip install -r requirements.txt
+
+ # Copy project files
  COPY . .
  
- WORKDIR /app/api
+ # Environment variables for compiling the native engine:
+ # we need to find where pip installed llama-cpp-python to link against it.
+ # In Docker it is usually /usr/local/lib/python3.10/site-packages.
+ # ENV SITE_PACKAGES=/usr/local/lib/python3.10/site-packages
+
+ # Compile the engine
+ # WORKDIR /app/engine
+ # RUN g++ -O2 -shared -fPIC -o libbatch.so batch_server.cpp \
+ #     -I"${SITE_PACKAGES}/include" \
+ #     -L"${SITE_PACKAGES}/llama_cpp/lib" \
+ #     -lllama -Wl,-rpath,"${SITE_PACKAGES}/llama_cpp/lib"
  
+ # Model setup: download during build or mount a volume?
+ # Best practice: download during build if the model is small,
+ # otherwise use a script at runtime. Here we rely on the user
+ # mounting the model or running setup_model.sh, and assume a
+ # valid model is present or already downloaded.
+
+ WORKDIR /app
  EXPOSE 8000
  
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
+ # Start command
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
batcher.py ADDED
@@ -0,0 +1,29 @@
+ import asyncio
+
+ class BatchScheduler:
+     def __init__(self, max_batch=8, max_wait_ms=30):
+         self.queue = []
+         self.max_batch = max_batch
+         self.max_wait_ms = max_wait_ms
+         self.lock = asyncio.Lock()
+
+     async def add(self, prompt: str):
+         # Create a queue for streaming tokens
+         queue = asyncio.Queue()
+         async with self.lock:
+             self.queue.append((prompt, queue))
+         return queue
+
+     async def get_batch(self):
+         if not self.queue:
+             return None
+
+         # Artificial wait to accumulate requests
+         await asyncio.sleep(self.max_wait_ms / 1000)
+
+         async with self.lock:
+             # Take up to max_batch items from the queue
+             batch = self.queue[:self.max_batch]
+             self.queue = self.queue[self.max_batch:]
+
+         return batch if batch else None
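Note that batcher.py is not wired into main.py in this commit; the scheduler and the per-request queues it returns are unused so far. A minimal sketch of how it could be driven, assuming a background consumer task and an engine-style run_batch coroutine (both hypothetical, not part of this upload):

import asyncio
from batcher import BatchScheduler

async def consumer(scheduler, run_batch):
    # Hypothetical background task: drain the scheduler and fan results
    # back out to each request's per-prompt queue.
    while True:
        batch = await scheduler.get_batch()
        if batch is None:
            await asyncio.sleep(0.01)
            continue
        prompts = [p for p, _ in batch]
        queues = [q for _, q in batch]
        results = await run_batch(prompts)  # e.g. BatchInferenceEngine.generate_batch
        for q, text in zip(queues, results):
            await q.put(text)
            await q.put(None)  # sentinel: this request is finished

async def demo():
    scheduler = BatchScheduler(max_batch=4, max_wait_ms=20)

    async def fake_run_batch(prompts):
        return [p.upper() for p in prompts]  # stand-in for the real engine

    asyncio.create_task(consumer(scheduler, fake_run_batch))
    queue = await scheduler.add("hello batching")
    while (item := await queue.get()) is not None:
        print(item)

asyncio.run(demo())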
bridge.py ADDED
@@ -0,0 +1,73 @@
+ import ctypes
+ import os
+
+ # Load the shared library
+ LIB_PATH = os.path.abspath("../engine/libbatch.so")
+ if not os.path.exists(LIB_PATH):
+     raise FileNotFoundError(f"Shared library not found at: {LIB_PATH}. Did you compile the engine?")
+ lib = ctypes.CDLL(LIB_PATH)
+
+ # Define function signatures
+ lib.init_model.argtypes = [ctypes.c_char_p]
+ lib.init_model.restype = ctypes.c_bool
+
+ # Define function signatures for streaming
+ lib.start_batch.argtypes = [
+     ctypes.POINTER(ctypes.c_char_p),  # prompts
+     ctypes.c_int,                     # count
+     ctypes.c_int                      # max_tokens
+ ]
+ lib.start_batch.restype = None
+
+ lib.decode_step.argtypes = [
+     ctypes.POINTER(ctypes.c_char_p)   # results
+ ]
+ lib.decode_step.restype = ctypes.c_bool
+
+ # Load template
+ with open("../model/template.txt", "r") as f:
+     TEMPLATE = f.read()
+
+ def format_prompt(prompt: str) -> str:
+     return TEMPLATE.replace("{{prompt}}", prompt)
+
+ # Initialize the model
+ MODEL_PATH = os.path.abspath("../model/model.gguf").encode('utf-8')
+ if not lib.init_model(MODEL_PATH):
+     print(f"Failed to initialize model at {MODEL_PATH}")
+
+ def stream_batch(prompts):
+     count = len(prompts)
+
+     # Apply Ollama-style templates
+     formatted_prompts = [format_prompt(p) for p in prompts]
+
+     c_prompts = (ctypes.c_char_p * count)(*[p.encode('utf-8') for p in formatted_prompts])
+     c_results = (ctypes.c_char_p * count)()
+
+     # 1. Start Batch (Prefill)
+     lib.start_batch(c_prompts, count, 256)
+
+     # 2. Decode Loop
+     while True:
+         # Run one step
+         active = lib.decode_step(c_results)
+
+         # Collect results for this step
+         step_output = []
+         for i in range(count):
+             res = c_results[i]
+             if res:
+                 text = res.decode('utf-8')
+                 step_output.append(text)
+                 # libc.free(res)  # Ideally free, but for now we rely on OS cleanup or leak a small amount in this demo
+             else:
+                 step_output.append(None)
+
+         yield step_output
+
+         if not active:
+             break
+
+
+
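bridge.py expects a compiled engine/libbatch.so exposing init_model, start_batch, and decode_step, plus model/model.gguf and model/template.txt; none of these ship with this commit (the g++ step in the Dockerfile is commented out). Assuming those files exist so the module imports cleanly, consuming stream_batch might look roughly like this:

# Hypothetical consumer of bridge.stream_batch; the prompts are illustrative.
from bridge import stream_batch

prompts = ["What is continuous batching?", "Explain prefill vs decode."]
transcripts = ["" for _ in prompts]

# Each yielded step is a list with one entry per prompt: a token string for
# sequences still generating, or None for sequences that have finished.
for step_output in stream_batch(prompts):
    for i, piece in enumerate(step_output):
        if piece is not None:
            transcripts[i] += piece

for i, text in enumerate(transcripts):
    print(f"--- prompt {i} ---")
    print(text)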
engine.py ADDED
@@ -0,0 +1,100 @@
+ import asyncio
+ from typing import List, AsyncGenerator, Dict
+ from llama_cpp import Llama, LlamaGrammar
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class BatchInferenceEngine:
+     """
+     Pure Python batch inference engine using llama-cpp-python.
+     Loads the model once and handles multiple concurrent requests efficiently.
+     """
+
+     def __init__(self, model_path: str, n_ctx: int = 4096, n_threads: int = 4):
+         self.model_path = model_path
+         self.n_ctx = n_ctx
+         self.n_threads = n_threads
+         self._model: Llama = None
+         self._lock = asyncio.Lock()
+
+     def load(self):
+         """Load model once at startup"""
+         logger.info(f"Loading model from {self.model_path}")
+         self._model = Llama(
+             model_path=self.model_path,
+             n_ctx=self.n_ctx,
+             n_threads=self.n_threads,
+             n_batch=512,
+             verbose=False
+         )
+         logger.info("Model loaded successfully")
+
+     async def generate_stream(
+         self,
+         prompt: str,
+         max_tokens: int = 256,
+         temperature: float = 0.7,
+         stop: List[str] = None
+     ) -> AsyncGenerator[str, None]:
+         """
+         Async streaming generator for a single request.
+         Uses a thread pool to run the sync llama-cpp calls in the background.
+         """
+         if self._model is None:
+             raise RuntimeError("Model not loaded")
+
+         # Run blocking llama-cpp calls in the thread pool
+         loop = asyncio.get_event_loop()
+
+         def _generate():
+             return self._model.create_completion(
+                 prompt=prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 stop=stop or [],
+                 stream=True  # Enable streaming
+             )
+
+         # Get the streaming iterator (created in the thread pool)
+         stream = await loop.run_in_executor(None, _generate)
+         _end = object()  # sentinel marking an exhausted stream
+         # Pull each chunk in the thread pool too, so decoding never blocks the event loop
+         while (chunk := await loop.run_in_executor(None, next, stream, _end)) is not _end:
+             if "choices" in chunk and len(chunk["choices"]) > 0:
+                 delta = chunk["choices"][0].get("text", "")
+                 if delta:
+                     yield delta
+
+     async def generate_batch(
+         self,
+         prompts: List[str],
+         max_tokens: int = 256,
+         temperature: float = 0.7
+     ) -> List[str]:
+         """
+         Process multiple prompts efficiently.
+         On CPU, we process them sequentially to avoid contention.
+         """
+         results = []
+         for prompt in prompts:
+             chunks = []
+             async for token in self.generate_stream(prompt, max_tokens, temperature):
+                 chunks.append(token)
+             results.append("".join(chunks))
+         return results
+
+ # Global singleton instance
+ _engine: BatchInferenceEngine = None
+
+ def get_engine() -> BatchInferenceEngine:
+     global _engine
+     if _engine is None:
+         raise RuntimeError("Engine not initialized")
+     return _engine
+
+ def init_engine(model_path: str, **kwargs):
+     global _engine
+     _engine = BatchInferenceEngine(model_path, **kwargs)
+     _engine.load()
+     return _engine
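A standalone sketch of driving the engine outside FastAPI; the model path and prompts below are assumptions, and a real GGUF file must exist at that location:

import asyncio
from engine import init_engine, get_engine

async def main():
    # Load once at startup (path is an assumption for this sketch)
    init_engine("model/model.gguf", n_ctx=2048, n_threads=4)
    engine = get_engine()

    # Stream a single completion token by token
    async for token in engine.generate_stream("Q: What is 2 + 2?\nA:", max_tokens=32):
        print(token, end="", flush=True)
    print()

    # Sequential "batch" over several prompts
    results = await engine.generate_batch(["Hello!", "Name three colors."], max_tokens=24)
    for text in results:
        print(text)

asyncio.run(main())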
main.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ import asyncio
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI, HTTPException
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel
+ from typing import List, Optional
+ import logging
+
+ from engine import init_engine, get_engine
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Configuration
+ MODEL_PATH = os.getenv("MODEL_PATH", "model/model.gguf")
+ MODEL_URL = os.getenv("MODEL_URL", "https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF/resolve/main/Nanbeige4.1-3B.Q8_0.gguf")
+
+ class GenerateRequest(BaseModel):
+     prompt: str
+     max_tokens: int = 256
+     temperature: float = 0.7
+     stream: bool = True
+
+ class BatchRequest(BaseModel):
+     prompts: List[str]
+     max_tokens: int = 256
+     temperature: float = 0.7
+
+ def download_model():
+     """Download the model if it does not exist"""
+     if not os.path.exists(MODEL_PATH):
+         os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
+         logger.info(f"Downloading model from {MODEL_URL}")
+         import urllib.request
+         urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)
+         logger.info("Model downloaded")
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Startup
+     logger.info("Starting up...")
+     download_model()
+     init_engine(MODEL_PATH, n_ctx=4096, n_threads=4)
+     logger.info("Ready!")
+     yield
+     # Shutdown
+     logger.info("Shutting down...")
+
+ app = FastAPI(title="Nanbeige LLM API", lifespan=lifespan)
+
+ @app.post("/generate")
+ async def generate(req: GenerateRequest):
+     """Single-prompt generation with optional streaming"""
+     engine = get_engine()
+
+     if req.stream:
+         async def stream_generator():
+             async for token in engine.generate_stream(
+                 req.prompt,
+                 max_tokens=req.max_tokens,
+                 temperature=req.temperature
+             ):
+                 yield token
+
+         return StreamingResponse(
+             stream_generator(),
+             media_type="text/plain"
+         )
+     else:
+         # Non-streaming: collect all tokens
+         chunks = []
+         async for token in engine.generate_stream(
+             req.prompt,
+             max_tokens=req.max_tokens,
+             temperature=req.temperature
+         ):
+             chunks.append(token)
+         return {"text": "".join(chunks)}
+
+ @app.post("/generate_batch")
+ async def generate_batch(req: BatchRequest):
+     """Batch generation (multiple prompts)"""
+     engine = get_engine()
+     results = await engine.generate_batch(
+         req.prompts,
+         max_tokens=req.max_tokens,
+         temperature=req.temperature
+     )
+     return {"results": results}
+
+ @app.get("/health")
+ async def health():
+     return {"status": "ok", "model_loaded": get_engine()._model is not None}
setup_model.sh CHANGED
@@ -2,7 +2,8 @@
2
  set -e
3
 
4
  # Default URL (Nanbeige4.1-3B-f32-GGUF - Q8_0)
5
- DEFAULT_URL="https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF/resolve/main/Nanbeige4.1-3B.Q8_0.gguf"
 
6
  MODEL_URL=${1:-$DEFAULT_URL}
7
  MODEL_DIR="model"
8
  ENGINE_DIR="engine"
 
2
  set -e
3
 
4
  # Default URL (Nanbeige4.1-3B-f32-GGUF - Q8_0)
5
+ # DEFAULT_URL="https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF/resolve/main/Nanbeige4.1-3B.Q8_0.gguf"
6
+ DEFAULT_URL="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/blob/main/Phi-3-mini-4k-instruct-q4.gguf"
7
  MODEL_URL=${1:-$DEFAULT_URL}
8
  MODEL_DIR="model"
9
  ENGINE_DIR="engine"