Karan6933 committed
Commit 65a1dcc · verified · 1 Parent(s): a17c086

Upload 6 files

Files changed (6)
  1. Dockerfile +30 -10
  2. batcher.py +29 -0
  3. bridge.py +73 -0
  4. engine.py +100 -0
  5. main.py +94 -0
  6. setup_model.sh +2 -1
Dockerfile CHANGED
@@ -1,27 +1,47 @@
+ # Use a lightweight Python base
  FROM python:3.10-slim
  
  WORKDIR /app
  
- # Install ONLY runtime dependencies (no build tools!)
+ # Install system dependencies
  RUN apt-get update && apt-get install -y \
      libopenblas-dev \
-     libgomp1 \
-     wget \
      && rm -rf /var/lib/apt/lists/*
  
- # Install llama-cpp-python from PREBUILT wheel (3 seconds, no compilation)
- RUN pip install --no-cache-dir \
+
+ # Install llama-cpp-python from a PREBUILT wheel (3 seconds vs 10+ minutes)
+ RUN pip install \
      https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.22-cp310-cp310-linux_x86_64.whl
  
- # Install other Python deps
+ # Copy requirements first to leverage the Docker layer cache
  COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt
  
- # Copy application
+ # Install remaining requirements
+ RUN pip install -r requirements.txt
+
+ # Copy project files
  COPY . .
  
- WORKDIR /app/api
+ # Environment variables for compiling the native engine:
+ # we need to find where pip installed llama-cpp-python to link against it.
+ # In Docker it is usually /usr/local/lib/python3.10/site-packages.
+ # ENV SITE_PACKAGES=/usr/local/lib/python3.10/site-packages
+
+ # Compile the engine
+ # WORKDIR /app/engine
+ # RUN g++ -O2 -shared -fPIC -o libbatch.so batch_server.cpp \
+ #     -I"${SITE_PACKAGES}/include" \
+ #     -L"${SITE_PACKAGES}/llama_cpp/lib" \
+ #     -lllama -Wl,-rpath,"${SITE_PACKAGES}/llama_cpp/lib"
  
+ # Model setup: download during build or mount a volume?
+ # Best practice: download during build if the model is small,
+ # otherwise use a script at runtime. Here we rely on the user
+ # mounting the model or running setup_model.sh, and assume a
+ # valid model is present or already downloaded.
+
+ WORKDIR /app
  EXPOSE 8000
  
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
+ # Start command
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
batcher.py ADDED
@@ -0,0 +1,29 @@
+ import asyncio
+
+ class BatchScheduler:
+     def __init__(self, max_batch=8, max_wait_ms=30):
+         self.queue = []
+         self.max_batch = max_batch
+         self.max_wait_ms = max_wait_ms
+         self.lock = asyncio.Lock()
+
+     async def add(self, prompt: str):
+         # Create a queue for streaming tokens
+         queue = asyncio.Queue()
+         async with self.lock:
+             self.queue.append((prompt, queue))
+         return queue
+
+     async def get_batch(self):
+         if not self.queue:
+             return None
+
+         # Artificial wait to accumulate requests
+         await asyncio.sleep(self.max_wait_ms / 1000)
+
+         async with self.lock:
+             # Take up to max_batch items from the queue
+             batch = self.queue[:self.max_batch]
+             self.queue = self.queue[self.max_batch:]
+
+         return batch if batch else None
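Note that batcher.py is not wired into main.py in this commit; the scheduler and the per-request queues it returns are unused so far. A minimal sketch of how it could be driven, assuming a background consumer task and an engine-style run_batch coroutine (both hypothetical, not part of this upload):

import asyncio
from batcher import BatchScheduler

async def consumer(scheduler, run_batch):
    # Hypothetical background task: drain the scheduler and fan results
    # back out to each request's per-prompt queue.
    while True:
        batch = await scheduler.get_batch()
        if batch is None:
            await asyncio.sleep(0.01)
            continue
        prompts = [p for p, _ in batch]
        queues = [q for _, q in batch]
        results = await run_batch(prompts)  # e.g. BatchInferenceEngine.generate_batch
        for q, text in zip(queues, results):
            await q.put(text)
            await q.put(None)  # sentinel: this request is finished

async def demo():
    scheduler = BatchScheduler(max_batch=4, max_wait_ms=20)

    async def fake_run_batch(prompts):
        return [p.upper() for p in prompts]  # stand-in for the real engine

    asyncio.create_task(consumer(scheduler, fake_run_batch))
    queue = await scheduler.add("hello batching")
    while (item := await queue.get()) is not None:
        print(item)

asyncio.run(demo())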
bridge.py ADDED
@@ -0,0 +1,73 @@
+ import ctypes
+ import os
+
+ # Load the shared library
+ LIB_PATH = os.path.abspath("../engine/libbatch.so")
+ if not os.path.exists(LIB_PATH):
+     raise FileNotFoundError(f"Shared library not found at: {LIB_PATH}. Did you compile the engine?")
+ lib = ctypes.CDLL(LIB_PATH)
+
+ # Define function signatures
+ lib.init_model.argtypes = [ctypes.c_char_p]
+ lib.init_model.restype = ctypes.c_bool
+
+ # Define function signatures for streaming
+ lib.start_batch.argtypes = [
+     ctypes.POINTER(ctypes.c_char_p),  # prompts
+     ctypes.c_int,                     # count
+     ctypes.c_int                      # max_tokens
+ ]
+ lib.start_batch.restype = None
+
+ lib.decode_step.argtypes = [
+     ctypes.POINTER(ctypes.c_char_p)   # results
+ ]
+ lib.decode_step.restype = ctypes.c_bool
+
+ # Load template
+ with open("../model/template.txt", "r") as f:
+     TEMPLATE = f.read()
+
+ def format_prompt(prompt: str) -> str:
+     return TEMPLATE.replace("{{prompt}}", prompt)
+
+ # Initialize the model
+ MODEL_PATH = os.path.abspath("../model/model.gguf").encode('utf-8')
+ if not lib.init_model(MODEL_PATH):
+     print(f"Failed to initialize model at {MODEL_PATH}")
+
+ def stream_batch(prompts):
+     count = len(prompts)
+
+     # Apply Ollama-style templates
+     formatted_prompts = [format_prompt(p) for p in prompts]
+
+     c_prompts = (ctypes.c_char_p * count)(*[p.encode('utf-8') for p in formatted_prompts])
+     c_results = (ctypes.c_char_p * count)()
+
+     # 1. Start Batch (Prefill)
+     lib.start_batch(c_prompts, count, 256)
+
+     # 2. Decode Loop
+     while True:
+         # Run one step
+         active = lib.decode_step(c_results)
+
+         # Collect results for this step
+         step_output = []
+         for i in range(count):
+             res = c_results[i]
+             if res:
+                 text = res.decode('utf-8')
+                 step_output.append(text)
+                 # libc.free(res)  # Ideally free, but for now we rely on OS cleanup or leak a small amount in this demo
+             else:
+                 step_output.append(None)
+
+         yield step_output
+
+         if not active:
+             break
+
+
+
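bridge.py expects a compiled engine/libbatch.so exposing init_model, start_batch, and decode_step, plus model/model.gguf and model/template.txt; none of these ship with this commit (the g++ step in the Dockerfile is commented out). Assuming those files exist so the module imports cleanly, consuming stream_batch might look roughly like this:

# Hypothetical consumer of bridge.stream_batch; the prompts are illustrative.
from bridge import stream_batch

prompts = ["What is continuous batching?", "Explain prefill vs decode."]
transcripts = ["" for _ in prompts]

# Each yielded step is a list with one entry per prompt: a token string for
# sequences still generating, or None for sequences that have finished.
for step_output in stream_batch(prompts):
    for i, piece in enumerate(step_output):
        if piece is not None:
            transcripts[i] += piece

for i, text in enumerate(transcripts):
    print(f"--- prompt {i} ---")
    print(text)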
engine.py ADDED
@@ -0,0 +1,100 @@
+ import asyncio
+ from typing import List, AsyncGenerator, Dict
+ from llama_cpp import Llama, LlamaGrammar
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class BatchInferenceEngine:
+     """
+     Pure Python batch inference engine using llama-cpp-python.
+     Loads the model once and handles multiple concurrent requests efficiently.
+     """
+
+     def __init__(self, model_path: str, n_ctx: int = 4096, n_threads: int = 4):
+         self.model_path = model_path
+         self.n_ctx = n_ctx
+         self.n_threads = n_threads
+         self._model: Llama = None
+         self._lock = asyncio.Lock()
+
+     def load(self):
+         """Load model once at startup"""
+         logger.info(f"Loading model from {self.model_path}")
+         self._model = Llama(
+             model_path=self.model_path,
+             n_ctx=self.n_ctx,
+             n_threads=self.n_threads,
+             n_batch=512,
+             verbose=False
+         )
+         logger.info("Model loaded successfully")
+
+     async def generate_stream(
+         self,
+         prompt: str,
+         max_tokens: int = 256,
+         temperature: float = 0.7,
+         stop: List[str] = None
+     ) -> AsyncGenerator[str, None]:
+         """
+         Async streaming generator for a single request.
+         Uses a thread pool to run the sync llama-cpp calls in the background.
+         """
+         if self._model is None:
+             raise RuntimeError("Model not loaded")
+
+         # Run blocking llama-cpp calls in the thread pool
+         loop = asyncio.get_event_loop()
+
+         def _generate():
+             return self._model.create_completion(
+                 prompt=prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 stop=stop or [],
+                 stream=True  # Enable streaming
+             )
+
+         # Get the streaming iterator (created in the thread pool)
+         stream = await loop.run_in_executor(None, _generate)
+         _end = object()  # sentinel marking an exhausted stream
+         # Pull each chunk in the thread pool too, so decoding never blocks the event loop
+         while (chunk := await loop.run_in_executor(None, next, stream, _end)) is not _end:
+             if "choices" in chunk and len(chunk["choices"]) > 0:
+                 delta = chunk["choices"][0].get("text", "")
+                 if delta:
+                     yield delta
+
+     async def generate_batch(
+         self,
+         prompts: List[str],
+         max_tokens: int = 256,
+         temperature: float = 0.7
+     ) -> List[str]:
+         """
+         Process multiple prompts efficiently.
+         On CPU, we process them sequentially to avoid contention.
+         """
+         results = []
+         for prompt in prompts:
+             chunks = []
+             async for token in self.generate_stream(prompt, max_tokens, temperature):
+                 chunks.append(token)
+             results.append("".join(chunks))
+         return results
+
+ # Global singleton instance
+ _engine: BatchInferenceEngine = None
+
+ def get_engine() -> BatchInferenceEngine:
+     global _engine
+     if _engine is None:
+         raise RuntimeError("Engine not initialized")
+     return _engine
+
+ def init_engine(model_path: str, **kwargs):
+     global _engine
+     _engine = BatchInferenceEngine(model_path, **kwargs)
+     _engine.load()
+     return _engine
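A standalone sketch of driving the engine outside FastAPI; the model path and prompts below are assumptions, and a real GGUF file must exist at that location:

import asyncio
from engine import init_engine, get_engine

async def main():
    # Load once at startup (path is an assumption for this sketch)
    init_engine("model/model.gguf", n_ctx=2048, n_threads=4)
    engine = get_engine()

    # Stream a single completion token by token
    async for token in engine.generate_stream("Q: What is 2 + 2?\nA:", max_tokens=32):
        print(token, end="", flush=True)
    print()

    # Sequential "batch" over several prompts
    results = await engine.generate_batch(["Hello!", "Name three colors."], max_tokens=24)
    for text in results:
        print(text)

asyncio.run(main())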
main.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ import asyncio
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI, HTTPException
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel
+ from typing import List, Optional
+ import logging
+
+ from engine import init_engine, get_engine
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Configuration
+ MODEL_PATH = os.getenv("MODEL_PATH", "model/model.gguf")
+ MODEL_URL = os.getenv("MODEL_URL", "https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF/resolve/main/Nanbeige4.1-3B.Q8_0.gguf")
+
+ class GenerateRequest(BaseModel):
+     prompt: str
+     max_tokens: int = 256
+     temperature: float = 0.7
+     stream: bool = True
+
+ class BatchRequest(BaseModel):
+     prompts: List[str]
+     max_tokens: int = 256
+     temperature: float = 0.7
+
+ def download_model():
+     """Download the model if it does not exist"""
+     if not os.path.exists(MODEL_PATH):
+         os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
+         logger.info(f"Downloading model from {MODEL_URL}")
+         import urllib.request
+         urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)
+         logger.info("Model downloaded")
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Startup
+     logger.info("Starting up...")
+     download_model()
+     init_engine(MODEL_PATH, n_ctx=4096, n_threads=4)
+     logger.info("Ready!")
+     yield
+     # Shutdown
+     logger.info("Shutting down...")
+
+ app = FastAPI(title="Nanbeige LLM API", lifespan=lifespan)
+
+ @app.post("/generate")
+ async def generate(req: GenerateRequest):
+     """Single-prompt generation with optional streaming"""
+     engine = get_engine()
+
+     if req.stream:
+         async def stream_generator():
+             async for token in engine.generate_stream(
+                 req.prompt,
+                 max_tokens=req.max_tokens,
+                 temperature=req.temperature
+             ):
+                 yield token
+
+         return StreamingResponse(
+             stream_generator(),
+             media_type="text/plain"
+         )
+     else:
+         # Non-streaming: collect all tokens
+         chunks = []
+         async for token in engine.generate_stream(
+             req.prompt,
+             max_tokens=req.max_tokens,
+             temperature=req.temperature
+         ):
+             chunks.append(token)
+         return {"text": "".join(chunks)}
+
+ @app.post("/generate_batch")
+ async def generate_batch(req: BatchRequest):
+     """Batch generation (multiple prompts)"""
+     engine = get_engine()
+     results = await engine.generate_batch(
+         req.prompts,
+         max_tokens=req.max_tokens,
+         temperature=req.temperature
+     )
+     return {"results": results}
+
+ @app.get("/health")
+ async def health():
+     return {"status": "ok", "model_loaded": get_engine()._model is not None}
setup_model.sh CHANGED
@@ -2,7 +2,8 @@
2
  set -e
3
 
4
  # Default URL (Nanbeige4.1-3B-f32-GGUF - Q8_0)
5
- DEFAULT_URL="https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF/resolve/main/Nanbeige4.1-3B.Q8_0.gguf"
 
6
  MODEL_URL=${1:-$DEFAULT_URL}
7
  MODEL_DIR="model"
8
  ENGINE_DIR="engine"
 
2
  set -e
3
 
4
  # Default URL (Nanbeige4.1-3B-f32-GGUF - Q8_0)
5
+ # DEFAULT_URL="https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF/resolve/main/Nanbeige4.1-3B.Q8_0.gguf"
6
+ DEFAULT_URL="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/blob/main/Phi-3-mini-4k-instruct-q4.gguf"
7
  MODEL_URL=${1:-$DEFAULT_URL}
8
  MODEL_DIR="model"
9
  ENGINE_DIR="engine"