Upload 17 files
- Dockerfile +41 -0
- api/__pycache__/batcher.cpython-312.pyc +0 -0
- api/__pycache__/bridge.cpython-312.pyc +0 -0
- api/__pycache__/engine.cpython-312.pyc +0 -0
- api/__pycache__/main.cpython-312.pyc +0 -0
- api/__pycache__/server.cpython-312.pyc +0 -0
- api/batcher.py +29 -0
- api/bridge.py +73 -0
- api/main.py +81 -0
- api/server_logs.txt +273 -0
- engine/CMakeLists.txt +20 -0
- engine/batch_server.cpp +184 -0
- engine/libbatch.so +0 -0
- model/config.json +6 -0
- model/template.txt +7 -0
- requirements.txt +3 -0
- setup_model.sh +56 -0
Dockerfile
ADDED
@@ -0,0 +1,41 @@
# Use a lightweight Python base
FROM python:3.12-slim

# Install build tools
RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements first for cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy project files
COPY . .

# Set environment variables for compilation.
# We need to find where pip installed llama-cpp-python to link against it;
# in Docker this is usually /usr/local/lib/python3.12/site-packages.
ENV SITE_PACKAGES=/usr/local/lib/python3.12/site-packages

# Compile the engine
WORKDIR /app/engine
RUN g++ -O2 -shared -fPIC -o libbatch.so batch_server.cpp \
    -I"${SITE_PACKAGES}/include" \
    -L"${SITE_PACKAGES}/llama_cpp/lib" \
    -lllama -Wl,-rpath,"${SITE_PACKAGES}/llama_cpp/lib"

# Model setup: the model could be downloaded at build time (if small enough) or
# fetched by a script at runtime. Here we rely on the user mounting the model or
# running setup_model.sh, and assume a valid model is present at model/model.gguf.

WORKDIR /app/api
EXPOSE 8000

# Start command
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
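The compile step above hard-codes SITE_PACKAGES to reach the libllama shared library that the llama-cpp-python wheel ships under llama_cpp/lib. As a hedged alternative (a sketch, not part of this upload; the helper name is hypothetical), the directory can be resolved from the installed package itself, so it does not need to be guessed per Python version:

# locate_llama_lib.py - sketch: resolve llama-cpp-python's bundled lib directory
# instead of hard-coding SITE_PACKAGES (helper name and usage are assumptions).
import os
import llama_cpp  # installed via requirements.txt

lib_dir = os.path.join(os.path.dirname(llama_cpp.__file__), "lib")
print(lib_dir)  # e.g. /usr/local/lib/python3.12/site-packages/llama_cpp/lib

The g++ invocation could then take the path via command substitution, e.g. -L"$(python locate_llama_lib.py)", instead of the fixed ENV value.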
api/__pycache__/batcher.cpython-312.pyc
ADDED
Binary file (2.05 kB)
api/__pycache__/bridge.cpython-312.pyc
ADDED
Binary file (3.03 kB)
api/__pycache__/engine.cpython-312.pyc
ADDED
Binary file (1.78 kB)
api/__pycache__/main.cpython-312.pyc
ADDED
Binary file (3.32 kB)
api/__pycache__/server.cpython-312.pyc
ADDED
Binary file (2.06 kB)
api/batcher.py
ADDED
@@ -0,0 +1,29 @@
import asyncio

class BatchScheduler:
    def __init__(self, max_batch=8, max_wait_ms=30):
        self.queue = []
        self.max_batch = max_batch
        self.max_wait_ms = max_wait_ms
        self.lock = asyncio.Lock()

    async def add(self, prompt: str):
        # Create a queue for streaming tokens
        queue = asyncio.Queue()
        async with self.lock:
            self.queue.append((prompt, queue))
        return queue

    async def get_batch(self):
        if not self.queue:
            return None

        # Artificial wait to accumulate requests
        await asyncio.sleep(self.max_wait_ms / 1000)

        async with self.lock:
            # Take up to max_batch items from the queue
            batch = self.queue[:self.max_batch]
            self.queue = self.queue[self.max_batch:]

        return batch if batch else None
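BatchScheduler pairs every incoming prompt with its own asyncio.Queue: request handlers keep the queue and read tokens from it, while the batch loop drains up to max_batch pending (prompt, queue) pairs at a time. A minimal driver sketch, not part of the upload and assuming only the class above (api/main.py below does the real wiring):

# Sketch: how add() and get_batch() are meant to cooperate.
import asyncio
from batcher import BatchScheduler

async def demo():
    scheduler = BatchScheduler(max_batch=8, max_wait_ms=30)

    # Producer side: each request registers a prompt and gets back its own token queue.
    q1 = await scheduler.add("Hello")
    q2 = await scheduler.add("Count to 10")

    # Consumer side: the batch loop pulls up to max_batch pending prompts at once.
    batch = await scheduler.get_batch()  # [("Hello", q1), ("Count to 10", q2)]
    print([prompt for prompt, _ in batch])

asyncio.run(demo())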
api/bridge.py
ADDED
@@ -0,0 +1,73 @@
import ctypes
import os

# Load the shared library
LIB_PATH = os.path.abspath("../engine/libbatch.so")
if not os.path.exists(LIB_PATH):
    raise FileNotFoundError(f"Shared library not found at: {LIB_PATH}. Did you compile the engine?")
lib = ctypes.CDLL(LIB_PATH)

# Define function signatures
lib.init_model.argtypes = [ctypes.c_char_p]
lib.init_model.restype = ctypes.c_bool

# Define function signatures for streaming
lib.start_batch.argtypes = [
    ctypes.POINTER(ctypes.c_char_p),  # prompts
    ctypes.c_int,                     # count
    ctypes.c_int                      # max_tokens
]
lib.start_batch.restype = None

lib.decode_step.argtypes = [
    ctypes.POINTER(ctypes.c_char_p)   # results
]
lib.decode_step.restype = ctypes.c_bool

# Load template
with open("../model/template.txt", "r") as f:
    TEMPLATE = f.read()

def format_prompt(prompt: str) -> str:
    return TEMPLATE.replace("{{prompt}}", prompt)

# Initialize the model
MODEL_PATH = os.path.abspath("../model/model.gguf").encode('utf-8')
if not lib.init_model(MODEL_PATH):
    print(f"Failed to initialize model at {MODEL_PATH}")

def stream_batch(prompts):
    count = len(prompts)

    # Apply Ollama-style templates
    formatted_prompts = [format_prompt(p) for p in prompts]

    c_prompts = (ctypes.c_char_p * count)(*[p.encode('utf-8') for p in formatted_prompts])
    c_results = (ctypes.c_char_p * count)()

    # 1. Start Batch (Prefill)
    lib.start_batch(c_prompts, count, 256)

    # 2. Decode Loop
    while True:
        # Run one step
        active = lib.decode_step(c_results)

        # Collect results for this step
        step_output = []
        for i in range(count):
            res = c_results[i]
            if res:
                text = res.decode('utf-8')
                step_output.append(text)
                # libc.free(res)  # Ideally free, but for now we rely on OS cleanup or leak a small amount in this demo
            else:
                step_output.append(None)

        yield step_output

        if not active:
            break
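The commented-out libc.free(res) above means every strdup'd piece returned by decode_step is currently leaked. A hedged sketch of one way to reclaim them from Python, assuming a Linux libc (libc.so.6) and that the C side keeps returning strdup'd pointers: read the results as raw addresses, copy the bytes out, then free each pointer.

# Sketch only: freeing the strdup'd pieces instead of leaking them.
# Assumes Linux (libc.so.6) and that decode_step returns strdup'd pointers.
import ctypes

libc = ctypes.CDLL("libc.so.6")
libc.free.argtypes = [ctypes.c_void_p]
libc.free.restype = None

def collect_step(lib, count):
    # Raw void pointers, so ctypes hands us addresses instead of copying to bytes.
    c_results = (ctypes.c_void_p * count)()
    active = lib.decode_step(ctypes.cast(c_results, ctypes.POINTER(ctypes.c_char_p)))

    step_output = []
    for ptr in c_results:
        if ptr:
            step_output.append(ctypes.string_at(ptr).decode("utf-8"))
            libc.free(ptr)  # release the strdup'd buffer now that the bytes are copied
        else:
            step_output.append(None)
    return active, step_output

The cast keeps the argtypes declared above satisfied while still letting Python hold the raw addresses it needs to pass to free().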
api/main.py
ADDED
@@ -0,0 +1,81 @@
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from batcher import BatchScheduler
from bridge import stream_batch
import asyncio
import time

app = FastAPI()
scheduler = BatchScheduler(max_batch=8, max_wait_ms=30)

# In-memory chat history (per process, for demo)
chat_histories = {}

@app.post("/chat")
async def chat(prompt: str, session_id: str = "default"):
    # Simple history management
    if session_id not in chat_histories:
        chat_histories[session_id] = []

    # Contextual prompt construction
    history = "\n".join(chat_histories[session_id])
    if history:
        full_prompt = f"{history}\n{prompt}"
    else:
        full_prompt = prompt

    # Get the queue for this request
    token_queue = await scheduler.add(full_prompt)

    # Generator to yield tokens from the queue
    async def response_generator():
        full_response = []
        while True:
            token = await token_queue.get()
            if token is None:
                break
            yield token
            full_response.append(token)

        # After streaming is done, update history.
        # Note: this runs after the response closes; a background task would be
        # stricter, but for a generator the code simply continues after the last yield.
        response_text = "".join(full_response)
        chat_histories[session_id].append(f"User: {prompt}")
        chat_histories[session_id].append(f"AI: {response_text}")

        # Keep history concise
        if len(chat_histories[session_id]) > 10:
            chat_histories[session_id] = chat_histories[session_id][-10:]

    return StreamingResponse(response_generator(), media_type="text/plain")

async def batch_loop():
    print("Batch loop started...")
    while True:
        # Wait for a batch
        batch = await scheduler.get_batch()
        if not batch:
            await asyncio.sleep(0.01)  # Short sleep if empty
            continue

        # Process batch
        prompts, queues = zip(*batch)
        print(f"Processing batch of {len(prompts)} prompts")

        # Stream from the C++ engine:
        # iterate over the generator, which yields tokens step by step
        for step_tokens in stream_batch(prompts):
            for q, token in zip(queues, step_tokens):
                if token is not None:
                    q.put_nowait(token)
            # Yield control to the event loop so FastAPI can flush tokens
            await asyncio.sleep(0)

        # Signal done
        for q in queues:
            q.put_nowait(None)

@app.on_event("startup")
async def startup_event():
    asyncio.create_task(batch_loop())
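Since prompt and session_id are plain query parameters (the POST /chat?prompt=... line in the server logs below shows this) and the response streams as text/plain, any HTTP client that can stream a body works. A minimal client sketch, assuming the requests package:

# Sketch: streaming client for the /chat endpoint (requests is an assumption;
# any HTTP client that can stream the response body works the same way).
import requests

resp = requests.post(
    "http://localhost:8000/chat",
    params={"prompt": "Count to 10", "session_id": "demo"},
    stream=True,
)
resp.raise_for_status()

# Tokens arrive as they are generated; print them as one running line.
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)
print()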
api/server_logs.txt
ADDED
@@ -0,0 +1,273 @@
llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from /home/karanpc/Desktop/Ollama Copy/ultra-fast-llm-pro/model/model.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = phi3
llama_model_loader: - kv 1: general.name str = Phi3
llama_model_loader: - kv 2: phi3.context_length u32 = 4096
llama_model_loader: - kv 3: phi3.embedding_length u32 = 3072
llama_model_loader: - kv 4: phi3.feed_forward_length u32 = 8192
llama_model_loader: - kv 5: phi3.block_count u32 = 32
llama_model_loader: - kv 6: phi3.attention.head_count u32 = 32
llama_model_loader: - kv 7: phi3.attention.head_count_kv u32 = 32
llama_model_loader: - kv 8: phi3.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 9: phi3.rope.dimension_count u32 = 96
llama_model_loader: - kv 10: general.file_type u32 = 15
llama_model_loader: - kv 11: tokenizer.ggml.model str = llama
llama_model_loader: - kv 12: tokenizer.ggml.pre str = default
llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32064] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32064] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32064] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000
llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 32000
llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...
llama_model_loader: - kv 23: general.quantization_version u32 = 2
llama_model_loader: - type f32: 65 tensors
llama_model_loader: - type q4_K: 81 tensors
llama_model_loader: - type q5_K: 32 tensors
llama_model_loader: - type q6_K: 17 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = Q4_K - Medium
print_info: file size = 2.23 GiB (5.01 BPW)
init_tokenizer: initializing tokenizer for type 1
load: control-looking token: 32007 '<|end|>' was not control-type; this is probably a bug in the model. its type will be overridden
load: control-looking token: 32000 '<|endoftext|>' was not control-type; this is probably a bug in the model. its type will be overridden
load: control token: 2 '</s>' is not marked as EOG
load: control token: 1 '<s>' is not marked as EOG
load: printing all EOG tokens:
load: - 32000 ('<|endoftext|>')
load: - 32007 ('<|end|>')
load: special tokens cache size = 67
load: token to piece cache size = 0.1690 MB
print_info: arch = phi3
print_info: vocab_only = 0
print_info: n_ctx_train = 4096
print_info: n_embd = 3072
print_info: n_layer = 32
print_info: n_head = 32
print_info: n_head_kv = 32
print_info: n_rot = 96
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 96
print_info: n_embd_head_v = 96
print_info: n_gqa = 1
print_info: n_embd_k_gqa = 3072
print_info: n_embd_v_gqa = 3072
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 0.0e+00
print_info: f_attn_scale = 0.0e+00
print_info: n_ff = 8192
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 2
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 4096
print_info: rope_finetuned = unknown
print_info: model type = 3B
print_info: model params = 3.82 B
print_info: general.name = Phi3
print_info: vocab type = SPM
print_info: n_vocab = 32064
print_info: n_merges = 0
print_info: BOS token = 1 '<s>'
print_info: EOS token = 32000 '<|endoftext|>'
print_info: EOT token = 32007 '<|end|>'
print_info: UNK token = 0 '<unk>'
print_info: PAD token = 32000 '<|endoftext|>'
print_info: LF token = 13 '<0x0A>'
print_info: EOG token = 32000 '<|endoftext|>'
print_info: EOG token = 32007 '<|end|>'
print_info: max token length = 48
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: layer 0 assigned to device CPU, is_swa = 0
load_tensors: layer 1 assigned to device CPU, is_swa = 0
load_tensors: layer 2 assigned to device CPU, is_swa = 0
load_tensors: layer 3 assigned to device CPU, is_swa = 0
load_tensors: layer 4 assigned to device CPU, is_swa = 0
load_tensors: layer 5 assigned to device CPU, is_swa = 0
load_tensors: layer 6 assigned to device CPU, is_swa = 0
load_tensors: layer 7 assigned to device CPU, is_swa = 0
load_tensors: layer 8 assigned to device CPU, is_swa = 0
load_tensors: layer 9 assigned to device CPU, is_swa = 0
load_tensors: layer 10 assigned to device CPU, is_swa = 0
load_tensors: layer 11 assigned to device CPU, is_swa = 0
load_tensors: layer 12 assigned to device CPU, is_swa = 0
load_tensors: layer 13 assigned to device CPU, is_swa = 0
load_tensors: layer 14 assigned to device CPU, is_swa = 0
load_tensors: layer 15 assigned to device CPU, is_swa = 0
load_tensors: layer 16 assigned to device CPU, is_swa = 0
load_tensors: layer 17 assigned to device CPU, is_swa = 0
load_tensors: layer 18 assigned to device CPU, is_swa = 0
load_tensors: layer 19 assigned to device CPU, is_swa = 0
load_tensors: layer 20 assigned to device CPU, is_swa = 0
load_tensors: layer 21 assigned to device CPU, is_swa = 0
load_tensors: layer 22 assigned to device CPU, is_swa = 0
load_tensors: layer 23 assigned to device CPU, is_swa = 0
load_tensors: layer 24 assigned to device CPU, is_swa = 0
load_tensors: layer 25 assigned to device CPU, is_swa = 0
load_tensors: layer 26 assigned to device CPU, is_swa = 0
load_tensors: layer 27 assigned to device CPU, is_swa = 0
load_tensors: layer 28 assigned to device CPU, is_swa = 0
load_tensors: layer 29 assigned to device CPU, is_swa = 0
load_tensors: layer 30 assigned to device CPU, is_swa = 0
load_tensors: layer 31 assigned to device CPU, is_swa = 0
load_tensors: layer 32 assigned to device CPU, is_swa = 0
load_tensors: tensor 'token_embd.weight' (q4_K) (and 114 others) cannot be used with preferred buffer type CPU_REPACK, using CPU instead
load_tensors: CPU_REPACK model buffer size = 1242.00 MiB
load_tensors: CPU_Mapped model buffer size = 2281.66 MiB
repack: repack tensor blk.0.attn_output.weight with q4_K_8x8
repack: repack tensor blk.0.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.1.attn_output.weight with q4_K_8x8
repack: repack tensor blk.1.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.2.attn_output.weight with q4_K_8x8
.repack: repack tensor blk.2.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.3.attn_output.weight with q4_K_8x8
repack: repack tensor blk.3.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.3.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.4.attn_output.weight with q4_K_8x8
repack: repack tensor blk.4.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.5.attn_output.weight with q4_K_8x8
repack: repack tensor blk.5.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.5.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.6.attn_output.weight with q4_K_8x8
repack: repack tensor blk.6.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.6.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.7.attn_output.weight with q4_K_8x8
repack: repack tensor blk.7.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.8.attn_output.weight with q4_K_8x8
repack: repack tensor blk.8.ffn_down.weight with q4_K_8x8
repack: repack tensor blk.8.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.9.attn_output.weight with q4_K_8x8
repack: repack tensor blk.9.ffn_down.weight with q4_K_8x8
repack: repack tensor blk.9.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.10.attn_output.weight with q4_K_8x8
repack: repack tensor blk.10.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.11.attn_output.weight with q4_K_8x8
repack: repack tensor blk.11.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.12.attn_output.weight with q4_K_8x8
.repack: repack tensor blk.12.ffn_down.weight with q4_K_8x8
repack: repack tensor blk.12.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.13.attn_output.weight with q4_K_8x8
.repack: repack tensor blk.13.ffn_down.weight with q4_K_8x8
repack: repack tensor blk.13.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.14.attn_output.weight with q4_K_8x8
.repack: repack tensor blk.14.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.15.attn_output.weight with q4_K_8x8
repack: repack tensor blk.15.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.15.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.16.attn_output.weight with q4_K_8x8
repack: repack tensor blk.16.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.16.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.17.attn_output.weight with q4_K_8x8
repack: repack tensor blk.17.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.18.attn_output.weight with q4_K_8x8
repack: repack tensor blk.18.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.18.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.19.attn_output.weight with q4_K_8x8
repack: repack tensor blk.19.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.19.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.20.attn_output.weight with q4_K_8x8
repack: repack tensor blk.20.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.20.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.21.attn_output.weight with q4_K_8x8
repack: repack tensor blk.21.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.22.attn_output.weight with q4_K_8x8
repack: repack tensor blk.22.ffn_down.weight with q4_K_8x8
repack: repack tensor blk.22.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.23.attn_output.weight with q4_K_8x8
repack: repack tensor blk.23.ffn_down.weight with q4_K_8x8
repack: repack tensor blk.23.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.24.attn_output.weight with q4_K_8x8
repack: repack tensor blk.24.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.25.attn_output.weight with q4_K_8x8
repack: repack tensor blk.25.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.25.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.26.attn_output.weight with q4_K_8x8
repack: repack tensor blk.26.ffn_down.weight with q4_K_8x8
.repack: repack tensor blk.26.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.27.attn_output.weight with q4_K_8x8
repack: repack tensor blk.27.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.28.attn_output.weight with q4_K_8x8
.repack: repack tensor blk.28.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.29.attn_output.weight with q4_K_8x8
repack: repack tensor blk.29.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.30.attn_output.weight with q4_K_8x8
repack: repack tensor blk.30.ffn_up.weight with q4_K_8x8
.repack: repack tensor blk.31.attn_output.weight with q4_K_8x8
repack: repack tensor blk.31.ffn_up.weight with q4_K_8x8
...........................................
llama_context: constructing llama_context
llama_context: n_seq_max = 16
llama_context: n_ctx = 4096
llama_context: n_ctx_per_seq = 256
llama_context: n_batch = 512
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = 0
llama_context: kv_unified = false
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_per_seq (256) < n_ctx_train (4096) -- the full capacity of the model will not be utilized
set_abort_callback: call
llama_context: CPU output buffer size = 1.96 MiB
create_memory: n_ctx = 4096 (padded)
llama_kv_cache_unified: layer 0: dev = CPU
llama_kv_cache_unified: layer 1: dev = CPU
llama_kv_cache_unified: layer 2: dev = CPU
llama_kv_cache_unified: layer 3: dev = CPU
llama_kv_cache_unified: layer 4: dev = CPU
llama_kv_cache_unified: layer 5: dev = CPU
llama_kv_cache_unified: layer 6: dev = CPU
llama_kv_cache_unified: layer 7: dev = CPU
llama_kv_cache_unified: layer 8: dev = CPU
llama_kv_cache_unified: layer 9: dev = CPU
llama_kv_cache_unified: layer 10: dev = CPU
llama_kv_cache_unified: layer 11: dev = CPU
llama_kv_cache_unified: layer 12: dev = CPU
llama_kv_cache_unified: layer 13: dev = CPU
llama_kv_cache_unified: layer 14: dev = CPU
llama_kv_cache_unified: layer 15: dev = CPU
llama_kv_cache_unified: layer 16: dev = CPU
llama_kv_cache_unified: layer 17: dev = CPU
llama_kv_cache_unified: layer 18: dev = CPU
llama_kv_cache_unified: layer 19: dev = CPU
llama_kv_cache_unified: layer 20: dev = CPU
llama_kv_cache_unified: layer 21: dev = CPU
llama_kv_cache_unified: layer 22: dev = CPU
llama_kv_cache_unified: layer 23: dev = CPU
llama_kv_cache_unified: layer 24: dev = CPU
llama_kv_cache_unified: layer 25: dev = CPU
llama_kv_cache_unified: layer 26: dev = CPU
llama_kv_cache_unified: layer 27: dev = CPU
llama_kv_cache_unified: layer 28: dev = CPU
llama_kv_cache_unified: layer 29: dev = CPU
llama_kv_cache_unified: layer 30: dev = CPU
llama_kv_cache_unified: layer 31: dev = CPU
llama_kv_cache_unified: CPU KV buffer size = 1536.00 MiB
llama_kv_cache_unified: size = 1536.00 MiB ( 256 cells, 32 layers, 16/16 seqs), K (f16): 768.00 MiB, V (f16): 768.00 MiB
llama_context: enumerating backends
llama_context: backend_ptrs.size() = 1
llama_context: max_nodes = 1560
llama_context: worst-case: n_tokens = 512, n_seqs = 16, n_outputs = 0
graph_reserve: reserving a graph for ubatch with n_tokens = 512, n_seqs = 16, n_outputs = 512
graph_reserve: reserving a graph for ubatch with n_tokens = 16, n_seqs = 16, n_outputs = 16
graph_reserve: reserving a graph for ubatch with n_tokens = 512, n_seqs = 16, n_outputs = 512
llama_context: CPU compute buffer size = 73.01 MiB
llama_context: graph nodes = 1126
llama_context: graph splits = 1
INFO: Started server process [12669]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
Batch loop started...
INFO: 127.0.0.1:59484 - "POST /chat?prompt=Count%20to%2010 HTTP/1.1" 200 OK
engine/CMakeLists.txt
ADDED
@@ -0,0 +1,20 @@
cmake_minimum_required(VERSION 3.10)
project(llama_batch_engine)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Find llama.cpp (assuming it's in the parent directory)
set(LLAMA_DIR "../../llama.cpp")
include_directories(${LLAMA_DIR}/include ${LLAMA_DIR}/common ${LLAMA_DIR})

# Sources
set(SOURCES batch_server.cpp)

# Build the shared library
add_library(batch SHARED ${SOURCES})

# Note: in a real environment you'd link against the compiled llama.cpp library.
# For this project we assume the user has llama.cpp compiled (or links the objects);
# here we just define the output name.
set_target_properties(batch PROPERTIES OUTPUT_NAME "batch")
engine/batch_server.cpp
ADDED
@@ -0,0 +1,184 @@
#include "llama.h"
#include <vector>
#include <string>
#include <cstring>
#include <cstdio>
#include <cstdlib>

// Global context for the loaded model
static llama_model* g_model = nullptr;
static llama_context* g_ctx = nullptr;
static llama_sampler* g_smpl = nullptr;

extern "C" {

// Initialize the model
bool init_model(const char* model_path) {
    llama_backend_init();

    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = 0; // CPU only for now
    g_model = llama_model_load_from_file(model_path, model_params);

    if (!g_model) return false;

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx = 4096; // 256 per sequence for 16 users
    ctx_params.n_batch = 512;
    ctx_params.n_threads = 8;
    ctx_params.n_threads_batch = 8;
    ctx_params.n_seq_max = 16;
    g_ctx = llama_init_from_model(g_model, ctx_params);

    if (!g_ctx) return false;

    auto sparams = llama_sampler_chain_default_params();
    g_smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(g_smpl, llama_sampler_init_greedy());

    return true;
}

// Helper to add a token to a batch manually
void batch_add(llama_batch & batch, llama_token id, llama_pos pos, const std::vector<llama_seq_id> & seq_ids, bool logits) {
    batch.token[batch.n_tokens] = id;
    batch.pos[batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits[batch.n_tokens] = logits;
    batch.n_tokens++;
}

// Global state for streaming
static int g_count = 0;
static int g_step = 0;
static int g_max_tokens = 0;
static std::vector<std::string> g_responses;
static std::vector<bool> g_active;
static std::vector<int> g_n_pos;
static std::vector<int> g_logits_idx;
static std::vector<std::vector<llama_token>> g_all_tokens;
static llama_batch g_batch;
static const llama_vocab* g_vocab = nullptr;

// 1. Start a new batch (Prefill)
void start_batch(const char** prompts, int count, int max_tokens) {
    if (!g_ctx || count == 0) return;

    g_vocab = llama_model_get_vocab(g_model);
    g_count = count;
    g_max_tokens = max_tokens;
    g_step = 0;

    // Reset state
    g_responses.assign(count, "");
    g_active.assign(count, true);
    g_n_pos.assign(count, 0);
    g_logits_idx.assign(count, -1);
    g_all_tokens.clear();

    // Tokenize
    for (int i = 0; i < count; i++) {
        int n_prompt = -llama_tokenize(g_vocab, prompts[i], strlen(prompts[i]), NULL, 0, true, true);
        std::vector<llama_token> tokens(n_prompt);
        llama_tokenize(g_vocab, prompts[i], strlen(prompts[i]), tokens.data(), tokens.size(), true, true);
        g_all_tokens.push_back(tokens);
    }

    // Clear KV cache
    llama_memory_clear(llama_get_memory(g_ctx), true);

    // Init batch
    if (g_batch.token) llama_batch_free(g_batch); // Free if exists
    g_batch = llama_batch_init(4096, 0, 1); // Larger batch for safety

    // Prefill
    g_batch.n_tokens = 0;
    for (int i = 0; i < count; i++) {
        for (size_t j = 0; j < g_all_tokens[i].size(); j++) {
            bool is_last = (j == g_all_tokens[i].size() - 1);
            if (is_last) g_logits_idx[i] = g_batch.n_tokens;
            batch_add(g_batch, g_all_tokens[i][j], g_n_pos[i]++, { (llama_seq_id)i }, is_last);
        }
    }

    // Decode Prefill
    if (llama_decode(g_ctx, g_batch)) {
        fprintf(stderr, "Failed to decode prefill\n");
    }
}

// 2. Decode one step (Generate next token for all active sequences)
// Returns true if any sequence is still active, false if all done
bool decode_step(const char** results) {
    if (g_step >= g_max_tokens) return false;

    g_batch.n_tokens = 0;
    bool any_active = false;
    std::vector<int> next_logits_idx(g_count, -1);
    int current_batch_pos = 0;

    for (int i = 0; i < g_count; i++) {
        results[i] = nullptr; // Default to null (no new token this step if inactive)

        if (!g_active[i]) continue;

        // Sample
        llama_token id = llama_sampler_sample(g_smpl, g_ctx, g_logits_idx[i]);
        llama_sampler_accept(g_smpl, id);

        // Check EOG/limit
        if (llama_vocab_is_eog(g_vocab, id) || g_n_pos[i] >= 4096) { // Hard limit matches n_ctx
            g_active[i] = false;
            continue;
        }

        // Decode token to string
        static char buf[256]; // Static buffer for simplicity (single-threaded engine)
        int n = llama_token_to_piece(g_vocab, id, buf, sizeof(buf), 0, true);
        if (n < 0) {
            n = 0; // Error or empty piece: emit an empty string for this step
        }
        buf[n] = '\0'; // llama_token_to_piece returns a length, not a null-terminated string

        // For a ctypes `const char**` the pointer must stay valid after this call returns,
        // so hand back a strdup'd copy. The Python bridge consumes it immediately;
        // ideally it would also free it, but in this demo the small allocation is leaked.
        results[i] = strdup(buf);

        next_logits_idx[i] = current_batch_pos++;
        batch_add(g_batch, id, g_n_pos[i]++, { (llama_seq_id)i }, true);
        any_active = true;
    }

    if (!any_active) return false;

    g_logits_idx = next_logits_idx;
    if (llama_decode(g_ctx, g_batch)) {
        return false;
    }

    g_step++;
    return true;
}

// Cleanup
void cleanup() {
    if (g_smpl) llama_sampler_free(g_smpl);
    if (g_ctx) llama_free(g_ctx);
    if (g_model) llama_model_free(g_model);
}

}
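The exported C API is a three-call protocol: init_model once, start_batch to prefill a group of prompts, then decode_step in a loop until it returns false (each call produces at most one new piece per active sequence). A standalone smoke-test sketch of that protocol, mirroring api/bridge.py without the template and FastAPI layers; the library and model paths here are assumptions:

# Sketch: drive libbatch.so directly to verify the start_batch/decode_step protocol.
# Library and model paths are assumptions; adjust to where they live on your machine.
import ctypes

lib = ctypes.CDLL("./libbatch.so")
lib.init_model.argtypes = [ctypes.c_char_p]
lib.init_model.restype = ctypes.c_bool
lib.start_batch.argtypes = [ctypes.POINTER(ctypes.c_char_p), ctypes.c_int, ctypes.c_int]
lib.start_batch.restype = None
lib.decode_step.argtypes = [ctypes.POINTER(ctypes.c_char_p)]
lib.decode_step.restype = ctypes.c_bool

assert lib.init_model(b"../model/model.gguf"), "model failed to load"

prompts = [b"<|user|>\nHello<|end|>\n<|assistant|>",
           b"<|user|>\nCount to 3<|end|>\n<|assistant|>"]
c_prompts = (ctypes.c_char_p * len(prompts))(*prompts)
c_results = (ctypes.c_char_p * len(prompts))()

lib.start_batch(c_prompts, len(prompts), 64)  # prefill both sequences
while True:
    active = lib.decode_step(c_results)       # one piece per active sequence
    for i, piece in enumerate(c_results):
        if piece:
            print(f"[seq {i}] {piece.decode('utf-8')}")
    if not active:
        break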
engine/libbatch.so
ADDED
Binary file (32.6 kB)
model/config.json
ADDED
@@ -0,0 +1,6 @@
{
  "name": "phi3",
  "architecture": "phi",
  "context_length": 4096,
  "quantization": "Q4_K_M"
}
model/template.txt
ADDED
@@ -0,0 +1,7 @@
<|system|>
You are a helpful AI assistant.
<|end|>
<|user|>
{{prompt}}
<|end|>
<|assistant|>
requirements.txt
ADDED
@@ -0,0 +1,3 @@
fastapi
uvicorn
llama-cpp-python
setup_model.sh
ADDED
@@ -0,0 +1,56 @@
#!/bin/bash
set -e

# Default model URL (override by passing a direct .gguf download URL as the first argument)
DEFAULT_URL="https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF?show_file_info=Nanbeige4.1-3B.Q8_0.gguf"
MODEL_URL=${1:-$DEFAULT_URL}
MODEL_DIR="model"
ENGINE_DIR="engine"

echo ">>> Setting up Ultra-Fast LLM..."

# 1. Setup Model Directory
if [ ! -d "$MODEL_DIR" ]; then
    mkdir -p "$MODEL_DIR"
fi

# 2. Download Model
if [ ! -f "$MODEL_DIR/model.gguf" ]; then
    echo ">>> Downloading model from $MODEL_URL..."
    wget -O "$MODEL_DIR/model.gguf" "$MODEL_URL" --show-progress
else
    echo ">>> Model already exists, using existing file..."
fi

# 3. Create Template (Phi-3 default)
if [ ! -f "$MODEL_DIR/template.txt" ]; then
    printf '<|user|>\n{{prompt}}<|end|>\n<|assistant|>\n' > "$MODEL_DIR/template.txt"
    echo ">>> Created default Phi-3 template."
fi

# 4. Check for Virtual Environment
if [ ! -d ".venv" ]; then
    echo ">>> Creating Python virtual environment..."
    python3 -m venv .venv
    source .venv/bin/activate
    pip install -r requirements.txt
else
    source .venv/bin/activate
fi

# 5. Compile Engine
echo ">>> Compiling C++ Optimization Engine..."
if [ -d "$ENGINE_DIR" ]; then
    cd "$ENGINE_DIR"
    g++ -O2 -march=native -shared -fPIC -o libbatch.so batch_server.cpp \
        -I"../.venv/lib/python3.12/site-packages/include" \
        -L"../.venv/lib/python3.12/site-packages/llama_cpp/lib" \
        -lllama -Wl,-rpath,"../.venv/lib/python3.12/site-packages/llama_cpp/lib"
    cd ..
else
    echo "Error: engine directory not found!"
    exit 1
fi

echo ">>> Setup Complete!"
echo "Run server with: cd api && ../.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000"