Karan6933 committed on
Commit 8d11530 · verified · 1 Parent(s): 7ddf91c

Upload 17 files

Dockerfile ADDED
@@ -0,0 +1,41 @@
+ # Use a lightweight Python base
+ FROM python:3.12-slim
+
+ # Install build tools
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     wget \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Copy requirements first for cache
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy project files
+ COPY . .
+
+ # Environment variable for compilation:
+ # we need the directory where pip installed llama-cpp-python so we can link against its bundled libllama.
+ # In this image that is /usr/local/lib/python3.12/site-packages.
+ ENV SITE_PACKAGES=/usr/local/lib/python3.12/site-packages
+
+ # Compile the engine
+ WORKDIR /app/engine
+ RUN g++ -O2 -shared -fPIC -o libbatch.so batch_server.cpp \
+     -I"${SITE_PACKAGES}/include" \
+     -L"${SITE_PACKAGES}/llama_cpp/lib" \
+     -lllama -Wl,-rpath,"${SITE_PACKAGES}/llama_cpp/lib"
+
+ # Model setup:
+ # the model is deliberately not baked into the image.
+ # Either mount the model directory as a volume or run setup_model.sh to download it;
+ # the server expects model/model.gguf and model/template.txt to exist at runtime.
+ # The setup script ships with the image for convenience.
+
+ WORKDIR /app/api
+ EXPOSE 8000
+
+ # Start command
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
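
Note: the SITE_PACKAGES path above is specific to this python:3.12-slim image. When compiling outside the container, the following small Python sketch (not part of the commit; assumes llama-cpp-python is installed) prints the directory holding the bundled libllama, which is what the -L and -rpath flags need to point at:

    import os
    import llama_cpp

    # Directory that contains the libllama shared library bundled with llama-cpp-python
    lib_dir = os.path.join(os.path.dirname(llama_cpp.__file__), "lib")
    print(lib_dir)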
api/__pycache__/batcher.cpython-312.pyc ADDED
Binary file (2.05 kB).
 
api/__pycache__/bridge.cpython-312.pyc ADDED
Binary file (3.03 kB).
 
api/__pycache__/engine.cpython-312.pyc ADDED
Binary file (1.78 kB).
 
api/__pycache__/main.cpython-312.pyc ADDED
Binary file (3.32 kB).
 
api/__pycache__/server.cpython-312.pyc ADDED
Binary file (2.06 kB).
 
api/batcher.py ADDED
@@ -0,0 +1,29 @@
+ import asyncio
+
+ class BatchScheduler:
+     def __init__(self, max_batch=8, max_wait_ms=30):
+         self.queue = []
+         self.max_batch = max_batch
+         self.max_wait_ms = max_wait_ms
+         self.lock = asyncio.Lock()
+
+     async def add(self, prompt: str):
+         # Create a queue for streaming tokens
+         queue = asyncio.Queue()
+         async with self.lock:
+             self.queue.append((prompt, queue))
+         return queue
+
+     async def get_batch(self):
+         if not self.queue:
+             return None
+
+         # Artificial wait to accumulate requests
+         await asyncio.sleep(self.max_wait_ms / 1000)
+
+         async with self.lock:
+             # Take up to max_batch items from the queue
+             batch = self.queue[:self.max_batch]
+             self.queue = self.queue[self.max_batch:]
+
+         return batch if batch else None
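
A minimal usage sketch of the scheduler in isolation (hypothetical, not part of the commit; the real consumer is batch_loop in api/main.py, and fake token generation stands in for the engine):

    import asyncio
    from batcher import BatchScheduler

    async def main():
        scheduler = BatchScheduler(max_batch=4, max_wait_ms=20)

        async def consumer():
            # Drain batches and echo each prompt back word-by-word (stand-in for the real engine)
            while True:
                batch = await scheduler.get_batch()
                if not batch:
                    await asyncio.sleep(0.01)
                    continue
                for prompt, queue in batch:
                    for word in prompt.split():
                        queue.put_nowait(word + " ")
                    queue.put_nowait(None)  # end-of-stream marker

        consumer_task = asyncio.create_task(consumer())
        token_queue = await scheduler.add("hello batching world")
        while (tok := await token_queue.get()) is not None:
            print(tok, end="")
        print()

    asyncio.run(main())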
api/bridge.py ADDED
@@ -0,0 +1,73 @@
+ import ctypes
+ import os
+
+ # Load the shared library
+ LIB_PATH = os.path.abspath("../engine/libbatch.so")
+ if not os.path.exists(LIB_PATH):
+     raise FileNotFoundError(f"Shared library not found at: {LIB_PATH}. Did you compile the engine?")
+ lib = ctypes.CDLL(LIB_PATH)
+
+ # Define function signatures
+ lib.init_model.argtypes = [ctypes.c_char_p]
+ lib.init_model.restype = ctypes.c_bool
+
+ # Define function signatures for streaming
+ lib.start_batch.argtypes = [
+     ctypes.POINTER(ctypes.c_char_p),  # prompts
+     ctypes.c_int,                     # count
+     ctypes.c_int                      # max_tokens
+ ]
+ lib.start_batch.restype = None
+
+ lib.decode_step.argtypes = [
+     ctypes.POINTER(ctypes.c_char_p)   # results
+ ]
+ lib.decode_step.restype = ctypes.c_bool
+
+ # Load template
+ with open("../model/template.txt", "r") as f:
+     TEMPLATE = f.read()
+
+ def format_prompt(prompt: str) -> str:
+     return TEMPLATE.replace("{{prompt}}", prompt)
+
+ # Initialize the model
+ MODEL_PATH = os.path.abspath("../model/model.gguf").encode('utf-8')
+ if not lib.init_model(MODEL_PATH):
+     raise RuntimeError(f"Failed to initialize model at {MODEL_PATH!r}")
+
+ def stream_batch(prompts):
+     count = len(prompts)
+
+     # Apply the chat template to every prompt
+     formatted_prompts = [format_prompt(p) for p in prompts]
+
+     c_prompts = (ctypes.c_char_p * count)(*[p.encode('utf-8') for p in formatted_prompts])
+     c_results = (ctypes.c_char_p * count)()
+
+     # 1. Start batch (prefill)
+     lib.start_batch(c_prompts, count, 256)
+
+     # 2. Decode loop
+     while True:
+         # Run one step
+         active = lib.decode_step(c_results)
+
+         # Collect results for this step
+         step_output = []
+         for i in range(count):
+             res = c_results[i]
+             if res:
+                 text = res.decode('utf-8')
+                 step_output.append(text)
+                 # NOTE: the C side strdup()s each piece and it is never freed here,
+                 # so every token leaks a small allocation in this demo.
+             else:
+                 step_output.append(None)
+
+         yield step_output
+
+         if not active:
+             break
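
On the leak flagged above: the strdup()'d pieces can be released from Python by treating the results as raw pointers and handing them back to the C allocator. A hedged sketch (not part of the commit; read_and_free is a hypothetical helper, and "libc.so.6" assumes a glibc system such as the python:3.12-slim image):

    import ctypes

    libc = ctypes.CDLL("libc.so.6")
    libc.free.argtypes = [ctypes.c_void_p]
    libc.free.restype = None

    def read_and_free(c_results_void, count):
        # c_results_void is a (ctypes.c_void_p * count)() array filled by decode_step
        out = []
        for i in range(count):
            ptr = c_results_void[i]
            if ptr:
                out.append(ctypes.string_at(ptr).decode("utf-8"))
                libc.free(ptr)            # release the strdup()'d buffer from decode_step
                c_results_void[i] = None
            else:
                out.append(None)
        return out

    # usage sketch: allocate the results as void pointers instead of c_char_p, e.g.
    #   c_results = (ctypes.c_void_p * count)()
    #   lib.decode_step(ctypes.cast(c_results, ctypes.POINTER(ctypes.c_char_p)))
    #   step_output = read_and_free(c_results, count)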
api/main.py ADDED
@@ -0,0 +1,81 @@
+ from fastapi import FastAPI
+ from fastapi.responses import StreamingResponse
+ from batcher import BatchScheduler
+ from bridge import stream_batch
+ import asyncio
+
+ app = FastAPI()
+ scheduler = BatchScheduler(max_batch=8, max_wait_ms=30)
+
+ # In-memory chat history (per process, for demo)
+ chat_histories = {}
+
+ @app.post("/chat")
+ async def chat(prompt: str, session_id: str = "default"):
+     # Simple history management
+     if session_id not in chat_histories:
+         chat_histories[session_id] = []
+
+     # Contextual prompt construction
+     history = "\n".join(chat_histories[session_id])
+     if history:
+         full_prompt = f"{history}\n{prompt}"
+     else:
+         full_prompt = prompt
+
+     # Get the queue for this request
+     token_queue = await scheduler.add(full_prompt)
+
+     # Generator that yields tokens from the queue
+     async def response_generator():
+         full_response = []
+         while True:
+             token = await token_queue.get()
+             if token is None:
+                 break
+             yield token
+             full_response.append(token)
+
+         # After streaming finishes, update the history
+         # (this runs inside the generator, after the last token has been yielded)
+         response_text = "".join(full_response)
+         chat_histories[session_id].append(f"User: {prompt}")
+         chat_histories[session_id].append(f"AI: {response_text}")
+
+         # Keep history concise
+         if len(chat_histories[session_id]) > 10:
+             chat_histories[session_id] = chat_histories[session_id][-10:]
+
+     return StreamingResponse(response_generator(), media_type="text/plain")
+
+ async def batch_loop():
+     print("Batch loop started...")
+     while True:
+         # Wait for a batch
+         batch = await scheduler.get_batch()
+         if not batch:
+             await asyncio.sleep(0.01)  # Short sleep if empty
+             continue
+
+         # Process batch
+         prompts, queues = zip(*batch)
+         print(f"Processing batch of {len(prompts)} prompts")
+
+         # Stream from the C++ engine: the generator yields one list of tokens per decode step
+         for step_tokens in stream_batch(prompts):
+             for q, token in zip(queues, step_tokens):
+                 if token is not None:
+                     q.put_nowait(token)
+             # Yield control to the event loop so FastAPI can flush tokens
+             await asyncio.sleep(0)
+
+         # Signal completion
+         for q in queues:
+             q.put_nowait(None)
+
+ @app.on_event("startup")
+ async def startup_event():
+     asyncio.create_task(batch_loop())
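
A small client-side sketch for the /chat endpoint (not part of the commit; assumes the server is running on localhost:8000 and the requests package is installed — it is not in requirements.txt). prompt and session_id are passed as query parameters, matching the signature above:

    import requests

    # Stream a reply from /chat and print tokens as they arrive
    with requests.post(
        "http://localhost:8000/chat",
        params={"prompt": "Count to 10", "session_id": "demo"},
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            print(chunk, end="", flush=True)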
api/server_logs.txt ADDED
@@ -0,0 +1,273 @@
+ llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from /home/karanpc/Desktop/Ollama Copy/ultra-fast-llm-pro/model/model.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = phi3
+ llama_model_loader: - kv 1: general.name str = Phi3
+ llama_model_loader: - kv 2: phi3.context_length u32 = 4096
+ llama_model_loader: - kv 3: phi3.embedding_length u32 = 3072
+ llama_model_loader: - kv 4: phi3.feed_forward_length u32 = 8192
+ llama_model_loader: - kv 5: phi3.block_count u32 = 32
+ llama_model_loader: - kv 6: phi3.attention.head_count u32 = 32
+ llama_model_loader: - kv 7: phi3.attention.head_count_kv u32 = 32
+ llama_model_loader: - kv 8: phi3.attention.layer_norm_rms_epsilon f32 = 0.000010
+ llama_model_loader: - kv 9: phi3.rope.dimension_count u32 = 96
+ llama_model_loader: - kv 10: general.file_type u32 = 15
+ llama_model_loader: - kv 11: tokenizer.ggml.model str = llama
+ llama_model_loader: - kv 12: tokenizer.ggml.pre str = default
+ llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32064] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
+ llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32064] = [0.000000, 0.000000, 0.000000, 0.0000...
+ llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32064] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+ llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1
+ llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000
+ llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0
+ llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 32000
+ llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true
+ llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false
+ llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...
+ llama_model_loader: - kv 23: general.quantization_version u32 = 2
+ llama_model_loader: - type f32: 65 tensors
+ llama_model_loader: - type q4_K: 81 tensors
+ llama_model_loader: - type q5_K: 32 tensors
+ llama_model_loader: - type q6_K: 17 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = Q4_K - Medium
+ print_info: file size = 2.23 GiB (5.01 BPW)
+ init_tokenizer: initializing tokenizer for type 1
+ load: control-looking token: 32007 '<|end|>' was not control-type; this is probably a bug in the model. its type will be overridden
+ load: control-looking token: 32000 '<|endoftext|>' was not control-type; this is probably a bug in the model. its type will be overridden
+ load: control token: 2 '</s>' is not marked as EOG
+ load: control token: 1 '<s>' is not marked as EOG
+ load: printing all EOG tokens:
+ load: - 32000 ('<|endoftext|>')
+ load: - 32007 ('<|end|>')
+ load: special tokens cache size = 67
+ load: token to piece cache size = 0.1690 MB
+ print_info: arch = phi3
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 4096
+ print_info: n_embd = 3072
+ print_info: n_layer = 32
+ print_info: n_head = 32
+ print_info: n_head_kv = 32
+ print_info: n_rot = 96
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 96
+ print_info: n_embd_head_v = 96
+ print_info: n_gqa = 1
+ print_info: n_embd_k_gqa = 3072
+ print_info: n_embd_v_gqa = 3072
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-05
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 8192
+ print_info: n_expert = 0
+ print_info: n_expert_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 4096
+ print_info: rope_finetuned = unknown
+ print_info: model type = 3B
+ print_info: model params = 3.82 B
+ print_info: general.name = Phi3
+ print_info: vocab type = SPM
+ print_info: n_vocab = 32064
+ print_info: n_merges = 0
+ print_info: BOS token = 1 '<s>'
+ print_info: EOS token = 32000 '<|endoftext|>'
+ print_info: EOT token = 32007 '<|end|>'
+ print_info: UNK token = 0 '<unk>'
+ print_info: PAD token = 32000 '<|endoftext|>'
+ print_info: LF token = 13 '<0x0A>'
+ print_info: EOG token = 32000 '<|endoftext|>'
+ print_info: EOG token = 32007 '<|end|>'
+ print_info: max token length = 48
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: layer 0 assigned to device CPU, is_swa = 0
+ load_tensors: layer 1 assigned to device CPU, is_swa = 0
+ load_tensors: layer 2 assigned to device CPU, is_swa = 0
+ load_tensors: layer 3 assigned to device CPU, is_swa = 0
+ load_tensors: layer 4 assigned to device CPU, is_swa = 0
+ load_tensors: layer 5 assigned to device CPU, is_swa = 0
+ load_tensors: layer 6 assigned to device CPU, is_swa = 0
+ load_tensors: layer 7 assigned to device CPU, is_swa = 0
+ load_tensors: layer 8 assigned to device CPU, is_swa = 0
+ load_tensors: layer 9 assigned to device CPU, is_swa = 0
+ load_tensors: layer 10 assigned to device CPU, is_swa = 0
+ load_tensors: layer 11 assigned to device CPU, is_swa = 0
+ load_tensors: layer 12 assigned to device CPU, is_swa = 0
+ load_tensors: layer 13 assigned to device CPU, is_swa = 0
+ load_tensors: layer 14 assigned to device CPU, is_swa = 0
+ load_tensors: layer 15 assigned to device CPU, is_swa = 0
+ load_tensors: layer 16 assigned to device CPU, is_swa = 0
+ load_tensors: layer 17 assigned to device CPU, is_swa = 0
+ load_tensors: layer 18 assigned to device CPU, is_swa = 0
+ load_tensors: layer 19 assigned to device CPU, is_swa = 0
+ load_tensors: layer 20 assigned to device CPU, is_swa = 0
+ load_tensors: layer 21 assigned to device CPU, is_swa = 0
+ load_tensors: layer 22 assigned to device CPU, is_swa = 0
+ load_tensors: layer 23 assigned to device CPU, is_swa = 0
+ load_tensors: layer 24 assigned to device CPU, is_swa = 0
+ load_tensors: layer 25 assigned to device CPU, is_swa = 0
+ load_tensors: layer 26 assigned to device CPU, is_swa = 0
+ load_tensors: layer 27 assigned to device CPU, is_swa = 0
+ load_tensors: layer 28 assigned to device CPU, is_swa = 0
+ load_tensors: layer 29 assigned to device CPU, is_swa = 0
+ load_tensors: layer 30 assigned to device CPU, is_swa = 0
+ load_tensors: layer 31 assigned to device CPU, is_swa = 0
+ load_tensors: layer 32 assigned to device CPU, is_swa = 0
+ load_tensors: tensor 'token_embd.weight' (q4_K) (and 114 others) cannot be used with preferred buffer type CPU_REPACK, using CPU instead
+ load_tensors: CPU_REPACK model buffer size = 1242.00 MiB
+ load_tensors: CPU_Mapped model buffer size = 2281.66 MiB
+ repack: repack tensor blk.0.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.0.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.1.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.1.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.2.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.2.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.3.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.3.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.3.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.4.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.4.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.5.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.5.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.5.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.6.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.6.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.6.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.7.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.7.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.8.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.8.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.8.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.9.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.9.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.9.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.10.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.10.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.11.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.11.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.12.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.12.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.12.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.13.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.13.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.13.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.14.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.14.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.15.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.15.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.15.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.16.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.16.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.16.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.17.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.17.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.18.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.18.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.18.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.19.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.19.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.19.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.20.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.20.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.20.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.21.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.21.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.22.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.22.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.22.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.23.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.23.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.23.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.24.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.24.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.25.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.25.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.25.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.26.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.26.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.26.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.27.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.27.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.28.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.28.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.29.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.29.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.30.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.30.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.31.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.31.ffn_up.weight with q4_K_8x8
+ ...........................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 16
+ llama_context: n_ctx = 4096
+ llama_context: n_ctx_per_seq = 256
+ llama_context: n_batch = 512
+ llama_context: n_ubatch = 512
+ llama_context: causal_attn = 1
+ llama_context: flash_attn = 0
+ llama_context: kv_unified = false
+ llama_context: freq_base = 10000.0
+ llama_context: freq_scale = 1
+ llama_context: n_ctx_per_seq (256) < n_ctx_train (4096) -- the full capacity of the model will not be utilized
+ set_abort_callback: call
+ llama_context: CPU output buffer size = 1.96 MiB
+ create_memory: n_ctx = 4096 (padded)
+ llama_kv_cache_unified: layer 0: dev = CPU
+ llama_kv_cache_unified: layer 1: dev = CPU
+ llama_kv_cache_unified: layer 2: dev = CPU
+ llama_kv_cache_unified: layer 3: dev = CPU
+ llama_kv_cache_unified: layer 4: dev = CPU
+ llama_kv_cache_unified: layer 5: dev = CPU
+ llama_kv_cache_unified: layer 6: dev = CPU
+ llama_kv_cache_unified: layer 7: dev = CPU
+ llama_kv_cache_unified: layer 8: dev = CPU
+ llama_kv_cache_unified: layer 9: dev = CPU
+ llama_kv_cache_unified: layer 10: dev = CPU
+ llama_kv_cache_unified: layer 11: dev = CPU
+ llama_kv_cache_unified: layer 12: dev = CPU
+ llama_kv_cache_unified: layer 13: dev = CPU
+ llama_kv_cache_unified: layer 14: dev = CPU
+ llama_kv_cache_unified: layer 15: dev = CPU
+ llama_kv_cache_unified: layer 16: dev = CPU
+ llama_kv_cache_unified: layer 17: dev = CPU
+ llama_kv_cache_unified: layer 18: dev = CPU
+ llama_kv_cache_unified: layer 19: dev = CPU
+ llama_kv_cache_unified: layer 20: dev = CPU
+ llama_kv_cache_unified: layer 21: dev = CPU
+ llama_kv_cache_unified: layer 22: dev = CPU
+ llama_kv_cache_unified: layer 23: dev = CPU
+ llama_kv_cache_unified: layer 24: dev = CPU
+ llama_kv_cache_unified: layer 25: dev = CPU
+ llama_kv_cache_unified: layer 26: dev = CPU
+ llama_kv_cache_unified: layer 27: dev = CPU
+ llama_kv_cache_unified: layer 28: dev = CPU
+ llama_kv_cache_unified: layer 29: dev = CPU
+ llama_kv_cache_unified: layer 30: dev = CPU
+ llama_kv_cache_unified: layer 31: dev = CPU
+ llama_kv_cache_unified: CPU KV buffer size = 1536.00 MiB
+ llama_kv_cache_unified: size = 1536.00 MiB ( 256 cells, 32 layers, 16/16 seqs), K (f16): 768.00 MiB, V (f16): 768.00 MiB
+ llama_context: enumerating backends
+ llama_context: backend_ptrs.size() = 1
+ llama_context: max_nodes = 1560
+ llama_context: worst-case: n_tokens = 512, n_seqs = 16, n_outputs = 0
+ graph_reserve: reserving a graph for ubatch with n_tokens = 512, n_seqs = 16, n_outputs = 512
+ graph_reserve: reserving a graph for ubatch with n_tokens = 16, n_seqs = 16, n_outputs = 16
+ graph_reserve: reserving a graph for ubatch with n_tokens = 512, n_seqs = 16, n_outputs = 512
+ llama_context: CPU compute buffer size = 73.01 MiB
+ llama_context: graph nodes = 1126
+ llama_context: graph splits = 1
+ INFO: Started server process [12669]
+ INFO: Waiting for application startup.
+ INFO: Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
+ Batch loop started...
+ INFO: 127.0.0.1:59484 - "POST /chat?prompt=Count%20to%2010 HTTP/1.1" 200 OK
engine/CMakeLists.txt ADDED
@@ -0,0 +1,20 @@
+ cmake_minimum_required(VERSION 3.10)
+ project(llama_batch_engine)
+
+ set(CMAKE_CXX_STANDARD 17)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+ # Find llama.cpp (assumed to be checked out in the parent directory)
+ set(LLAMA_DIR "../../llama.cpp")
+ include_directories(${LLAMA_DIR}/include ${LLAMA_DIR}/common ${LLAMA_DIR})
+
+ # Sources
+ set(SOURCES batch_server.cpp)
+
+ # Build the shared library
+ add_library(batch SHARED ${SOURCES})
+
+ # Note: in a real build you would also link against the compiled llama.cpp library
+ # (e.g. target_link_libraries(batch llama)). This project instead compiles with g++
+ # against the libllama bundled with llama-cpp-python (see setup_model.sh and the Dockerfile).
+ set_target_properties(batch PROPERTIES OUTPUT_NAME "batch")
engine/batch_server.cpp ADDED
@@ -0,0 +1,184 @@
+ #include "llama.h"
+ #include <vector>
+ #include <string>
+ #include <cstring>
+ #include <cstdio>
+ #include <cstdlib>
+
+ // Global context for the loaded model
+ static llama_model* g_model = nullptr;
+ static llama_context* g_ctx = nullptr;
+ static llama_sampler* g_smpl = nullptr;
+
+ extern "C" {
+
+ // Initialize the model
+ bool init_model(const char* model_path) {
+     llama_backend_init();
+
+     llama_model_params model_params = llama_model_default_params();
+     model_params.n_gpu_layers = 0; // CPU only for now
+     g_model = llama_model_load_from_file(model_path, model_params);
+
+     if (!g_model) return false;
+
+     llama_context_params ctx_params = llama_context_default_params();
+     ctx_params.n_ctx = 4096; // 256 per sequence for 16 users
+     ctx_params.n_batch = 512;
+     ctx_params.n_threads = 8;
+     ctx_params.n_threads_batch = 8;
+     ctx_params.n_seq_max = 16;
+     g_ctx = llama_init_from_model(g_model, ctx_params);
+
+     if (!g_ctx) return false;
+
+     auto sparams = llama_sampler_chain_default_params();
+     g_smpl = llama_sampler_chain_init(sparams);
+     llama_sampler_chain_add(g_smpl, llama_sampler_init_greedy());
+
+     return true;
+ }
+
+ // Helper to add a token to a batch manually
+ void batch_add(llama_batch & batch, llama_token id, llama_pos pos, const std::vector<llama_seq_id> & seq_ids, bool logits) {
+     batch.token[batch.n_tokens] = id;
+     batch.pos[batch.n_tokens] = pos;
+     batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+     for (size_t i = 0; i < seq_ids.size(); ++i) {
+         batch.seq_id[batch.n_tokens][i] = seq_ids[i];
+     }
+     batch.logits[batch.n_tokens] = logits;
+     batch.n_tokens++;
+ }
+
+ // Global state for streaming
+ static int g_count = 0;
+ static int g_step = 0;
+ static int g_max_tokens = 0;
+ static std::vector<std::string> g_responses;
+ static std::vector<bool> g_active;
+ static std::vector<int> g_n_pos;
+ static std::vector<int> g_logits_idx;
+ static std::vector<std::vector<llama_token>> g_all_tokens;
+ static llama_batch g_batch;
+ static const llama_vocab* g_vocab = nullptr;
+
+ // 1. Start a new batch (Prefill)
+ void start_batch(const char** prompts, int count, int max_tokens) {
+     if (!g_ctx || count == 0) return;
+
+     g_vocab = llama_model_get_vocab(g_model);
+     g_count = count;
+     g_max_tokens = max_tokens;
+     g_step = 0;
+
+     // Reset state
+     g_responses.assign(count, "");
+     g_active.assign(count, true);
+     g_n_pos.assign(count, 0);
+     g_logits_idx.assign(count, -1);
+     g_all_tokens.clear();
+
+     // Tokenize
+     for (int i = 0; i < count; i++) {
+         int n_prompt = -llama_tokenize(g_vocab, prompts[i], strlen(prompts[i]), NULL, 0, true, true);
+         std::vector<llama_token> tokens(n_prompt);
+         llama_tokenize(g_vocab, prompts[i], strlen(prompts[i]), tokens.data(), tokens.size(), true, true);
+         g_all_tokens.push_back(tokens);
+     }
+
+     // Clear KV cache
+     llama_memory_clear(llama_get_memory(g_ctx), true);
+
+     // Init batch
+     if (g_batch.token) llama_batch_free(g_batch); // Free the previous batch if it exists
+     g_batch = llama_batch_init(4096, 0, 1); // Larger batch for safety
+
+     // Prefill
+     g_batch.n_tokens = 0;
+     for (int i = 0; i < count; i++) {
+         for (size_t j = 0; j < g_all_tokens[i].size(); j++) {
+             bool is_last = (j == g_all_tokens[i].size() - 1);
+             if (is_last) g_logits_idx[i] = g_batch.n_tokens;
+             batch_add(g_batch, g_all_tokens[i][j], g_n_pos[i]++, { (llama_seq_id)i }, is_last);
+         }
+     }
+
+     // Decode prefill
+     if (llama_decode(g_ctx, g_batch)) {
+         fprintf(stderr, "Failed to decode prefill\n");
+     }
+ }
+
+ // 2. Decode one step (generate the next token for every active sequence)
+ // Returns true if any sequence is still active, false if all are done
+ bool decode_step(const char** results) {
+     if (g_step >= g_max_tokens) return false;
+
+     g_batch.n_tokens = 0;
+     bool any_active = false;
+     std::vector<int> next_logits_idx(g_count, -1);
+     int current_batch_pos = 0;
+
+     for (int i = 0; i < g_count; i++) {
+         results[i] = nullptr; // Default to null (no new token this step if inactive)
+
+         if (!g_active[i]) continue;
+
+         // Sample
+         llama_token id = llama_sampler_sample(g_smpl, g_ctx, g_logits_idx[i]);
+         llama_sampler_accept(g_smpl, id);
+
+         // Check EOG/limit
+         if (llama_vocab_is_eog(g_vocab, id) || g_n_pos[i] >= 4096) { // Hard limit matches n_ctx
+             g_active[i] = false;
+             continue;
+         }
+
+         // Decode token to string
+         static char buf[256]; // Static buffer for simplicity (the engine is single threaded)
+         int n = llama_token_to_piece(g_vocab, id, buf, sizeof(buf) - 1, 0, true);
+         if (n < 0) {
+             n = 0; // Conversion failed; treat as an empty piece
+         }
+         buf[n] = '\0'; // llama_token_to_piece does not null-terminate
+
+         // strdup() so the pointer stays valid after this call returns.
+         // The Python bridge reads the piece immediately; it currently never frees it,
+         // so each token leaks a small allocation in this demo (see bridge.py).
+         results[i] = strdup(buf);
+
+         next_logits_idx[i] = current_batch_pos++;
+         batch_add(g_batch, id, g_n_pos[i]++, { (llama_seq_id)i }, true);
+         any_active = true;
+     }
+
+     if (!any_active) return false;
+
+     g_logits_idx = next_logits_idx;
+     if (llama_decode(g_ctx, g_batch)) {
+         return false;
+     }
+
+     g_step++;
+     return true;
+ }
+
+ // Cleanup
+ void cleanup() {
+     if (g_smpl) llama_sampler_free(g_smpl);
+     if (g_ctx) llama_free(g_ctx);
+     if (g_model) llama_model_free(g_model);
+ }
+
+ }
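
cleanup() is exported but never called from the Python side. A hedged sketch of how api/bridge.py could register it on interpreter exit (not part of the commit; shown standalone here, but in bridge.py it would reuse the existing lib handle instead of reloading the library):

    import atexit
    import ctypes
    import os

    # Load the engine (bridge.py already has this handle as `lib`)
    lib = ctypes.CDLL(os.path.abspath("../engine/libbatch.so"))
    lib.cleanup.argtypes = []
    lib.cleanup.restype = None

    # Free the sampler, context and model when the Python process exits
    atexit.register(lib.cleanup)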
engine/libbatch.so ADDED
Binary file (32.6 kB).
 
model/config.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "name": "phi3",
+     "architecture": "phi",
+     "context_length": 4096,
+     "quantization": "Q4_K_M"
+ }
model/template.txt ADDED
@@ -0,0 +1,7 @@
+ <|system|>
+ You are a helpful AI assistant.
+ <|end|>
+ <|user|>
+ {{prompt}}
+ <|end|>
+ <|assistant|>
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ fastapi
+ uvicorn
+ llama-cpp-python
setup_model.sh ADDED
@@ -0,0 +1,56 @@
+ #!/bin/bash
+ set -e
+
+ # Default model URL (override by passing a direct .gguf download link as the first argument)
+ DEFAULT_URL="https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF?show_file_info=Nanbeige4.1-3B.Q8_0.gguf"
+ MODEL_URL=${1:-$DEFAULT_URL}
+ MODEL_DIR="model"
+ ENGINE_DIR="engine"
+
+ echo ">>> Setting up Ultra-Fast LLM..."
+
+ # 1. Set up the model directory
+ if [ ! -d "$MODEL_DIR" ]; then
+     mkdir -p "$MODEL_DIR"
+ fi
+
+ # 2. Download the model
+ if [ ! -f "$MODEL_DIR/model.gguf" ]; then
+     echo ">>> Downloading model from $MODEL_URL..."
+     wget -O "$MODEL_DIR/model.gguf" "$MODEL_URL" --show-progress
+ else
+     echo ">>> Model already exists, using the existing file..."
+ fi
+
+ # 3. Create the chat template (Phi-3 default)
+ if [ ! -f "$MODEL_DIR/template.txt" ]; then
+     printf '<|user|>\n{{prompt}}<|end|>\n<|assistant|>\n' > "$MODEL_DIR/template.txt"
+     echo ">>> Created default Phi-3 template."
+ fi
+
+ # 4. Check for a virtual environment
+ if [ ! -d ".venv" ]; then
+     echo ">>> Creating Python virtual environment..."
+     python3 -m venv .venv
+     source .venv/bin/activate
+     pip install -r requirements.txt
+ else
+     source .venv/bin/activate
+ fi
+
+ # 5. Compile the engine
+ echo ">>> Compiling C++ Optimization Engine..."
+ if [ -d "$ENGINE_DIR" ]; then
+     cd "$ENGINE_DIR"
+     g++ -O2 -march=native -shared -fPIC -o libbatch.so batch_server.cpp \
+         -I"../.venv/lib/python3.12/site-packages/include" \
+         -L"../.venv/lib/python3.12/site-packages/llama_cpp/lib" \
+         -lllama -Wl,-rpath,"../.venv/lib/python3.12/site-packages/llama_cpp/lib"
+     cd ..
+ else
+     echo "Error: engine directory not found!"
+     exit 1
+ fi
+
+ echo ">>> Setup Complete!"
+ echo "Run server with: cd api && ../.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000"