Karan6933 committed on
Commit 8d11530 · verified · 1 Parent(s): 7ddf91c

Upload 17 files

Dockerfile ADDED
@@ -0,0 +1,41 @@
+ # Use a lightweight Python base
+ FROM python:3.12-slim
+
+ # Install build tools
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     wget \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Copy requirements first for cache
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy project files
+ COPY . .
+
+ # Environment variable for compilation:
+ # we need the directory where pip installed llama-cpp-python so we can link against its bundled libllama.
+ # In this image that is /usr/local/lib/python3.12/site-packages.
+ ENV SITE_PACKAGES=/usr/local/lib/python3.12/site-packages
+
+ # Compile the engine
+ WORKDIR /app/engine
+ RUN g++ -O2 -shared -fPIC -o libbatch.so batch_server.cpp \
+     -I"${SITE_PACKAGES}/include" \
+     -L"${SITE_PACKAGES}/llama_cpp/lib" \
+     -lllama -Wl,-rpath,"${SITE_PACKAGES}/llama_cpp/lib"
+
+ # Model setup:
+ # the model is deliberately not baked into the image.
+ # Either mount the model directory as a volume or run setup_model.sh to download it;
+ # the server expects model/model.gguf and model/template.txt to exist at runtime.
+ # The setup script ships with the image for convenience.
+
+ WORKDIR /app/api
+ EXPOSE 8000
+
+ # Start command
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
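
Note: the SITE_PACKAGES path above is specific to this python:3.12-slim image. When compiling outside the container, the following small Python sketch (not part of the commit; assumes llama-cpp-python is installed) prints the directory holding the bundled libllama, which is what the -L and -rpath flags need to point at:

    import os
    import llama_cpp

    # Directory that contains the libllama shared library bundled with llama-cpp-python
    lib_dir = os.path.join(os.path.dirname(llama_cpp.__file__), "lib")
    print(lib_dir)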
api/__pycache__/batcher.cpython-312.pyc ADDED
Binary file (2.05 kB).
 
api/__pycache__/bridge.cpython-312.pyc ADDED
Binary file (3.03 kB).
 
api/__pycache__/engine.cpython-312.pyc ADDED
Binary file (1.78 kB).
 
api/__pycache__/main.cpython-312.pyc ADDED
Binary file (3.32 kB).
 
api/__pycache__/server.cpython-312.pyc ADDED
Binary file (2.06 kB).
 
api/batcher.py ADDED
@@ -0,0 +1,29 @@
+ import asyncio
+
+ class BatchScheduler:
+     def __init__(self, max_batch=8, max_wait_ms=30):
+         self.queue = []
+         self.max_batch = max_batch
+         self.max_wait_ms = max_wait_ms
+         self.lock = asyncio.Lock()
+
+     async def add(self, prompt: str):
+         # Create a queue for streaming tokens
+         queue = asyncio.Queue()
+         async with self.lock:
+             self.queue.append((prompt, queue))
+         return queue
+
+     async def get_batch(self):
+         if not self.queue:
+             return None
+
+         # Artificial wait to accumulate requests
+         await asyncio.sleep(self.max_wait_ms / 1000)
+
+         async with self.lock:
+             # Take up to max_batch items from the queue
+             batch = self.queue[:self.max_batch]
+             self.queue = self.queue[self.max_batch:]
+
+         return batch if batch else None
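
A minimal usage sketch of the scheduler in isolation (hypothetical, not part of the commit; the real consumer is batch_loop in api/main.py, and fake token generation stands in for the engine):

    import asyncio
    from batcher import BatchScheduler

    async def main():
        scheduler = BatchScheduler(max_batch=4, max_wait_ms=20)

        async def consumer():
            # Drain batches and echo each prompt back word-by-word (stand-in for the real engine)
            while True:
                batch = await scheduler.get_batch()
                if not batch:
                    await asyncio.sleep(0.01)
                    continue
                for prompt, queue in batch:
                    for word in prompt.split():
                        queue.put_nowait(word + " ")
                    queue.put_nowait(None)  # end-of-stream marker

        consumer_task = asyncio.create_task(consumer())
        token_queue = await scheduler.add("hello batching world")
        while (tok := await token_queue.get()) is not None:
            print(tok, end="")
        print()

    asyncio.run(main())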
api/bridge.py ADDED
@@ -0,0 +1,73 @@
+ import ctypes
+ import os
+
+ # Load the shared library
+ LIB_PATH = os.path.abspath("../engine/libbatch.so")
+ if not os.path.exists(LIB_PATH):
+     raise FileNotFoundError(f"Shared library not found at: {LIB_PATH}. Did you compile the engine?")
+ lib = ctypes.CDLL(LIB_PATH)
+
+ # Define function signatures
+ lib.init_model.argtypes = [ctypes.c_char_p]
+ lib.init_model.restype = ctypes.c_bool
+
+ # Define function signatures for streaming
+ lib.start_batch.argtypes = [
+     ctypes.POINTER(ctypes.c_char_p),  # prompts
+     ctypes.c_int,                     # count
+     ctypes.c_int                      # max_tokens
+ ]
+ lib.start_batch.restype = None
+
+ lib.decode_step.argtypes = [
+     ctypes.POINTER(ctypes.c_char_p)   # results
+ ]
+ lib.decode_step.restype = ctypes.c_bool
+
+ # Load template
+ with open("../model/template.txt", "r") as f:
+     TEMPLATE = f.read()
+
+ def format_prompt(prompt: str) -> str:
+     return TEMPLATE.replace("{{prompt}}", prompt)
+
+ # Initialize the model
+ MODEL_PATH = os.path.abspath("../model/model.gguf").encode('utf-8')
+ if not lib.init_model(MODEL_PATH):
+     raise RuntimeError(f"Failed to initialize model at {MODEL_PATH!r}")
+
+ def stream_batch(prompts):
+     count = len(prompts)
+
+     # Apply the chat template to every prompt
+     formatted_prompts = [format_prompt(p) for p in prompts]
+
+     c_prompts = (ctypes.c_char_p * count)(*[p.encode('utf-8') for p in formatted_prompts])
+     c_results = (ctypes.c_char_p * count)()
+
+     # 1. Start batch (prefill)
+     lib.start_batch(c_prompts, count, 256)
+
+     # 2. Decode loop
+     while True:
+         # Run one step
+         active = lib.decode_step(c_results)
+
+         # Collect results for this step
+         step_output = []
+         for i in range(count):
+             res = c_results[i]
+             if res:
+                 text = res.decode('utf-8')
+                 step_output.append(text)
+                 # NOTE: the C side strdup()s each piece and it is never freed here,
+                 # so every token leaks a small allocation in this demo.
+             else:
+                 step_output.append(None)
+
+         yield step_output
+
+         if not active:
+             break
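
On the leak flagged above: the strdup()'d pieces can be released from Python by treating the results as raw pointers and handing them back to the C allocator. A hedged sketch (not part of the commit; read_and_free is a hypothetical helper, and "libc.so.6" assumes a glibc system such as the python:3.12-slim image):

    import ctypes

    libc = ctypes.CDLL("libc.so.6")
    libc.free.argtypes = [ctypes.c_void_p]
    libc.free.restype = None

    def read_and_free(c_results_void, count):
        # c_results_void is a (ctypes.c_void_p * count)() array filled by decode_step
        out = []
        for i in range(count):
            ptr = c_results_void[i]
            if ptr:
                out.append(ctypes.string_at(ptr).decode("utf-8"))
                libc.free(ptr)            # release the strdup()'d buffer from decode_step
                c_results_void[i] = None
            else:
                out.append(None)
        return out

    # usage sketch: allocate the results as void pointers instead of c_char_p, e.g.
    #   c_results = (ctypes.c_void_p * count)()
    #   lib.decode_step(ctypes.cast(c_results, ctypes.POINTER(ctypes.c_char_p)))
    #   step_output = read_and_free(c_results, count)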
api/main.py ADDED
@@ -0,0 +1,81 @@
+ from fastapi import FastAPI
+ from fastapi.responses import StreamingResponse
+ from batcher import BatchScheduler
+ from bridge import stream_batch
+ import asyncio
+
+ app = FastAPI()
+ scheduler = BatchScheduler(max_batch=8, max_wait_ms=30)
+
+ # In-memory chat history (per process, for demo)
+ chat_histories = {}
+
+ @app.post("/chat")
+ async def chat(prompt: str, session_id: str = "default"):
+     # Simple history management
+     if session_id not in chat_histories:
+         chat_histories[session_id] = []
+
+     # Contextual prompt construction
+     history = "\n".join(chat_histories[session_id])
+     if history:
+         full_prompt = f"{history}\n{prompt}"
+     else:
+         full_prompt = prompt
+
+     # Get the queue for this request
+     token_queue = await scheduler.add(full_prompt)
+
+     # Generator that yields tokens from the queue
+     async def response_generator():
+         full_response = []
+         while True:
+             token = await token_queue.get()
+             if token is None:
+                 break
+             yield token
+             full_response.append(token)
+
+         # After streaming finishes, update the history
+         # (this runs inside the generator, after the last token has been yielded)
+         response_text = "".join(full_response)
+         chat_histories[session_id].append(f"User: {prompt}")
+         chat_histories[session_id].append(f"AI: {response_text}")
+
+         # Keep history concise
+         if len(chat_histories[session_id]) > 10:
+             chat_histories[session_id] = chat_histories[session_id][-10:]
+
+     return StreamingResponse(response_generator(), media_type="text/plain")
+
+ async def batch_loop():
+     print("Batch loop started...")
+     while True:
+         # Wait for a batch
+         batch = await scheduler.get_batch()
+         if not batch:
+             await asyncio.sleep(0.01)  # Short sleep if empty
+             continue
+
+         # Process batch
+         prompts, queues = zip(*batch)
+         print(f"Processing batch of {len(prompts)} prompts")
+
+         # Stream from the C++ engine: the generator yields one list of tokens per decode step
+         for step_tokens in stream_batch(prompts):
+             for q, token in zip(queues, step_tokens):
+                 if token is not None:
+                     q.put_nowait(token)
+             # Yield control to the event loop so FastAPI can flush tokens
+             await asyncio.sleep(0)
+
+         # Signal completion
+         for q in queues:
+             q.put_nowait(None)
+
+ @app.on_event("startup")
+ async def startup_event():
+     asyncio.create_task(batch_loop())
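
A small client-side sketch for the /chat endpoint (not part of the commit; assumes the server is running on localhost:8000 and the requests package is installed — it is not in requirements.txt). prompt and session_id are passed as query parameters, matching the signature above:

    import requests

    # Stream a reply from /chat and print tokens as they arrive
    with requests.post(
        "http://localhost:8000/chat",
        params={"prompt": "Count to 10", "session_id": "demo"},
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            print(chunk, end="", flush=True)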
api/server_logs.txt ADDED
@@ -0,0 +1,273 @@
+ llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from /home/karanpc/Desktop/Ollama Copy/ultra-fast-llm-pro/model/model.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = phi3
+ llama_model_loader: - kv 1: general.name str = Phi3
+ llama_model_loader: - kv 2: phi3.context_length u32 = 4096
+ llama_model_loader: - kv 3: phi3.embedding_length u32 = 3072
+ llama_model_loader: - kv 4: phi3.feed_forward_length u32 = 8192
+ llama_model_loader: - kv 5: phi3.block_count u32 = 32
+ llama_model_loader: - kv 6: phi3.attention.head_count u32 = 32
+ llama_model_loader: - kv 7: phi3.attention.head_count_kv u32 = 32
+ llama_model_loader: - kv 8: phi3.attention.layer_norm_rms_epsilon f32 = 0.000010
+ llama_model_loader: - kv 9: phi3.rope.dimension_count u32 = 96
+ llama_model_loader: - kv 10: general.file_type u32 = 15
+ llama_model_loader: - kv 11: tokenizer.ggml.model str = llama
+ llama_model_loader: - kv 12: tokenizer.ggml.pre str = default
+ llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32064] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
+ llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32064] = [0.000000, 0.000000, 0.000000, 0.0000...
+ llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32064] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+ llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1
+ llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000
+ llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0
+ llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 32000
+ llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true
+ llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false
+ llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...
+ llama_model_loader: - kv 23: general.quantization_version u32 = 2
+ llama_model_loader: - type f32: 65 tensors
+ llama_model_loader: - type q4_K: 81 tensors
+ llama_model_loader: - type q5_K: 32 tensors
+ llama_model_loader: - type q6_K: 17 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = Q4_K - Medium
+ print_info: file size = 2.23 GiB (5.01 BPW)
+ init_tokenizer: initializing tokenizer for type 1
+ load: control-looking token: 32007 '<|end|>' was not control-type; this is probably a bug in the model. its type will be overridden
+ load: control-looking token: 32000 '<|endoftext|>' was not control-type; this is probably a bug in the model. its type will be overridden
+ load: control token: 2 '</s>' is not marked as EOG
+ load: control token: 1 '<s>' is not marked as EOG
+ load: printing all EOG tokens:
+ load: - 32000 ('<|endoftext|>')
+ load: - 32007 ('<|end|>')
+ load: special tokens cache size = 67
+ load: token to piece cache size = 0.1690 MB
+ print_info: arch = phi3
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 4096
+ print_info: n_embd = 3072
+ print_info: n_layer = 32
+ print_info: n_head = 32
+ print_info: n_head_kv = 32
+ print_info: n_rot = 96
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 96
+ print_info: n_embd_head_v = 96
+ print_info: n_gqa = 1
+ print_info: n_embd_k_gqa = 3072
+ print_info: n_embd_v_gqa = 3072
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-05
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 8192
+ print_info: n_expert = 0
+ print_info: n_expert_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 4096
+ print_info: rope_finetuned = unknown
+ print_info: model type = 3B
+ print_info: model params = 3.82 B
+ print_info: general.name = Phi3
+ print_info: vocab type = SPM
+ print_info: n_vocab = 32064
+ print_info: n_merges = 0
+ print_info: BOS token = 1 '<s>'
+ print_info: EOS token = 32000 '<|endoftext|>'
+ print_info: EOT token = 32007 '<|end|>'
+ print_info: UNK token = 0 '<unk>'
+ print_info: PAD token = 32000 '<|endoftext|>'
+ print_info: LF token = 13 '<0x0A>'
+ print_info: EOG token = 32000 '<|endoftext|>'
+ print_info: EOG token = 32007 '<|end|>'
+ print_info: max token length = 48
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: layer 0 assigned to device CPU, is_swa = 0
+ load_tensors: layer 1 assigned to device CPU, is_swa = 0
+ load_tensors: layer 2 assigned to device CPU, is_swa = 0
+ load_tensors: layer 3 assigned to device CPU, is_swa = 0
+ load_tensors: layer 4 assigned to device CPU, is_swa = 0
+ load_tensors: layer 5 assigned to device CPU, is_swa = 0
+ load_tensors: layer 6 assigned to device CPU, is_swa = 0
+ load_tensors: layer 7 assigned to device CPU, is_swa = 0
+ load_tensors: layer 8 assigned to device CPU, is_swa = 0
+ load_tensors: layer 9 assigned to device CPU, is_swa = 0
+ load_tensors: layer 10 assigned to device CPU, is_swa = 0
+ load_tensors: layer 11 assigned to device CPU, is_swa = 0
+ load_tensors: layer 12 assigned to device CPU, is_swa = 0
+ load_tensors: layer 13 assigned to device CPU, is_swa = 0
+ load_tensors: layer 14 assigned to device CPU, is_swa = 0
+ load_tensors: layer 15 assigned to device CPU, is_swa = 0
+ load_tensors: layer 16 assigned to device CPU, is_swa = 0
+ load_tensors: layer 17 assigned to device CPU, is_swa = 0
+ load_tensors: layer 18 assigned to device CPU, is_swa = 0
+ load_tensors: layer 19 assigned to device CPU, is_swa = 0
+ load_tensors: layer 20 assigned to device CPU, is_swa = 0
+ load_tensors: layer 21 assigned to device CPU, is_swa = 0
+ load_tensors: layer 22 assigned to device CPU, is_swa = 0
+ load_tensors: layer 23 assigned to device CPU, is_swa = 0
+ load_tensors: layer 24 assigned to device CPU, is_swa = 0
+ load_tensors: layer 25 assigned to device CPU, is_swa = 0
+ load_tensors: layer 26 assigned to device CPU, is_swa = 0
+ load_tensors: layer 27 assigned to device CPU, is_swa = 0
+ load_tensors: layer 28 assigned to device CPU, is_swa = 0
+ load_tensors: layer 29 assigned to device CPU, is_swa = 0
+ load_tensors: layer 30 assigned to device CPU, is_swa = 0
+ load_tensors: layer 31 assigned to device CPU, is_swa = 0
+ load_tensors: layer 32 assigned to device CPU, is_swa = 0
+ load_tensors: tensor 'token_embd.weight' (q4_K) (and 114 others) cannot be used with preferred buffer type CPU_REPACK, using CPU instead
+ load_tensors: CPU_REPACK model buffer size = 1242.00 MiB
+ load_tensors: CPU_Mapped model buffer size = 2281.66 MiB
+ repack: repack tensor blk.0.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.0.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.1.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.1.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.2.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.2.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.3.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.3.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.3.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.4.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.4.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.5.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.5.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.5.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.6.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.6.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.6.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.7.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.7.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.8.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.8.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.8.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.9.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.9.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.9.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.10.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.10.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.11.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.11.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.12.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.12.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.12.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.13.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.13.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.13.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.14.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.14.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.15.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.15.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.15.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.16.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.16.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.16.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.17.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.17.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.18.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.18.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.18.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.19.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.19.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.19.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.20.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.20.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.20.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.21.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.21.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.22.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.22.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.22.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.23.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.23.ffn_down.weight with q4_K_8x8
+ repack: repack tensor blk.23.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.24.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.24.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.25.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.25.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.25.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.26.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.26.ffn_down.weight with q4_K_8x8
+ .repack: repack tensor blk.26.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.27.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.27.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.28.attn_output.weight with q4_K_8x8
+ .repack: repack tensor blk.28.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.29.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.29.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.30.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.30.ffn_up.weight with q4_K_8x8
+ .repack: repack tensor blk.31.attn_output.weight with q4_K_8x8
+ repack: repack tensor blk.31.ffn_up.weight with q4_K_8x8
+ ...........................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 16
+ llama_context: n_ctx = 4096
+ llama_context: n_ctx_per_seq = 256
+ llama_context: n_batch = 512
+ llama_context: n_ubatch = 512
+ llama_context: causal_attn = 1
+ llama_context: flash_attn = 0
+ llama_context: kv_unified = false
+ llama_context: freq_base = 10000.0
+ llama_context: freq_scale = 1
+ llama_context: n_ctx_per_seq (256) < n_ctx_train (4096) -- the full capacity of the model will not be utilized
+ set_abort_callback: call
+ llama_context: CPU output buffer size = 1.96 MiB
+ create_memory: n_ctx = 4096 (padded)
+ llama_kv_cache_unified: layer 0: dev = CPU
+ llama_kv_cache_unified: layer 1: dev = CPU
+ llama_kv_cache_unified: layer 2: dev = CPU
+ llama_kv_cache_unified: layer 3: dev = CPU
+ llama_kv_cache_unified: layer 4: dev = CPU
+ llama_kv_cache_unified: layer 5: dev = CPU
+ llama_kv_cache_unified: layer 6: dev = CPU
+ llama_kv_cache_unified: layer 7: dev = CPU
+ llama_kv_cache_unified: layer 8: dev = CPU
+ llama_kv_cache_unified: layer 9: dev = CPU
+ llama_kv_cache_unified: layer 10: dev = CPU
+ llama_kv_cache_unified: layer 11: dev = CPU
+ llama_kv_cache_unified: layer 12: dev = CPU
+ llama_kv_cache_unified: layer 13: dev = CPU
+ llama_kv_cache_unified: layer 14: dev = CPU
+ llama_kv_cache_unified: layer 15: dev = CPU
+ llama_kv_cache_unified: layer 16: dev = CPU
+ llama_kv_cache_unified: layer 17: dev = CPU
+ llama_kv_cache_unified: layer 18: dev = CPU
+ llama_kv_cache_unified: layer 19: dev = CPU
+ llama_kv_cache_unified: layer 20: dev = CPU
+ llama_kv_cache_unified: layer 21: dev = CPU
+ llama_kv_cache_unified: layer 22: dev = CPU
+ llama_kv_cache_unified: layer 23: dev = CPU
+ llama_kv_cache_unified: layer 24: dev = CPU
+ llama_kv_cache_unified: layer 25: dev = CPU
+ llama_kv_cache_unified: layer 26: dev = CPU
+ llama_kv_cache_unified: layer 27: dev = CPU
+ llama_kv_cache_unified: layer 28: dev = CPU
+ llama_kv_cache_unified: layer 29: dev = CPU
+ llama_kv_cache_unified: layer 30: dev = CPU
+ llama_kv_cache_unified: layer 31: dev = CPU
+ llama_kv_cache_unified: CPU KV buffer size = 1536.00 MiB
+ llama_kv_cache_unified: size = 1536.00 MiB ( 256 cells, 32 layers, 16/16 seqs), K (f16): 768.00 MiB, V (f16): 768.00 MiB
+ llama_context: enumerating backends
+ llama_context: backend_ptrs.size() = 1
+ llama_context: max_nodes = 1560
+ llama_context: worst-case: n_tokens = 512, n_seqs = 16, n_outputs = 0
+ graph_reserve: reserving a graph for ubatch with n_tokens = 512, n_seqs = 16, n_outputs = 512
+ graph_reserve: reserving a graph for ubatch with n_tokens = 16, n_seqs = 16, n_outputs = 16
+ graph_reserve: reserving a graph for ubatch with n_tokens = 512, n_seqs = 16, n_outputs = 512
+ llama_context: CPU compute buffer size = 73.01 MiB
+ llama_context: graph nodes = 1126
+ llama_context: graph splits = 1
+ INFO: Started server process [12669]
+ INFO: Waiting for application startup.
+ INFO: Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
+ Batch loop started...
+ INFO: 127.0.0.1:59484 - "POST /chat?prompt=Count%20to%2010 HTTP/1.1" 200 OK
engine/CMakeLists.txt ADDED
@@ -0,0 +1,20 @@
+ cmake_minimum_required(VERSION 3.10)
+ project(llama_batch_engine)
+
+ set(CMAKE_CXX_STANDARD 17)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+ # Find llama.cpp (assumed to be checked out in the parent directory)
+ set(LLAMA_DIR "../../llama.cpp")
+ include_directories(${LLAMA_DIR}/include ${LLAMA_DIR}/common ${LLAMA_DIR})
+
+ # Sources
+ set(SOURCES batch_server.cpp)
+
+ # Build the shared library
+ add_library(batch SHARED ${SOURCES})
+
+ # Note: in a real build you would also link against the compiled llama.cpp library
+ # (e.g. target_link_libraries(batch llama)). This project instead compiles with g++
+ # against the libllama bundled with llama-cpp-python (see setup_model.sh and the Dockerfile).
+ set_target_properties(batch PROPERTIES OUTPUT_NAME "batch")
engine/batch_server.cpp ADDED
@@ -0,0 +1,184 @@
+ #include "llama.h"
+ #include <vector>
+ #include <string>
+ #include <cstring>
+ #include <cstdio>
+ #include <cstdlib>
+
+ // Global context for the loaded model
+ static llama_model* g_model = nullptr;
+ static llama_context* g_ctx = nullptr;
+ static llama_sampler* g_smpl = nullptr;
+
+ extern "C" {
+
+ // Initialize the model
+ bool init_model(const char* model_path) {
+     llama_backend_init();
+
+     llama_model_params model_params = llama_model_default_params();
+     model_params.n_gpu_layers = 0; // CPU only for now
+     g_model = llama_model_load_from_file(model_path, model_params);
+
+     if (!g_model) return false;
+
+     llama_context_params ctx_params = llama_context_default_params();
+     ctx_params.n_ctx = 4096; // 256 per sequence for 16 users
+     ctx_params.n_batch = 512;
+     ctx_params.n_threads = 8;
+     ctx_params.n_threads_batch = 8;
+     ctx_params.n_seq_max = 16;
+     g_ctx = llama_init_from_model(g_model, ctx_params);
+
+     if (!g_ctx) return false;
+
+     auto sparams = llama_sampler_chain_default_params();
+     g_smpl = llama_sampler_chain_init(sparams);
+     llama_sampler_chain_add(g_smpl, llama_sampler_init_greedy());
+
+     return true;
+ }
+
+ // Helper to add a token to a batch manually
+ void batch_add(llama_batch & batch, llama_token id, llama_pos pos, const std::vector<llama_seq_id> & seq_ids, bool logits) {
+     batch.token[batch.n_tokens] = id;
+     batch.pos[batch.n_tokens] = pos;
+     batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+     for (size_t i = 0; i < seq_ids.size(); ++i) {
+         batch.seq_id[batch.n_tokens][i] = seq_ids[i];
+     }
+     batch.logits[batch.n_tokens] = logits;
+     batch.n_tokens++;
+ }
+
+ // Global state for streaming
+ static int g_count = 0;
+ static int g_step = 0;
+ static int g_max_tokens = 0;
+ static std::vector<std::string> g_responses;
+ static std::vector<bool> g_active;
+ static std::vector<int> g_n_pos;
+ static std::vector<int> g_logits_idx;
+ static std::vector<std::vector<llama_token>> g_all_tokens;
+ static llama_batch g_batch;
+ static const llama_vocab* g_vocab = nullptr;
+
+ // 1. Start a new batch (Prefill)
+ void start_batch(const char** prompts, int count, int max_tokens) {
+     if (!g_ctx || count == 0) return;
+
+     g_vocab = llama_model_get_vocab(g_model);
+     g_count = count;
+     g_max_tokens = max_tokens;
+     g_step = 0;
+
+     // Reset state
+     g_responses.assign(count, "");
+     g_active.assign(count, true);
+     g_n_pos.assign(count, 0);
+     g_logits_idx.assign(count, -1);
+     g_all_tokens.clear();
+
+     // Tokenize
+     for (int i = 0; i < count; i++) {
+         int n_prompt = -llama_tokenize(g_vocab, prompts[i], strlen(prompts[i]), NULL, 0, true, true);
+         std::vector<llama_token> tokens(n_prompt);
+         llama_tokenize(g_vocab, prompts[i], strlen(prompts[i]), tokens.data(), tokens.size(), true, true);
+         g_all_tokens.push_back(tokens);
+     }
+
+     // Clear KV cache
+     llama_memory_clear(llama_get_memory(g_ctx), true);
+
+     // Init batch
+     if (g_batch.token) llama_batch_free(g_batch); // Free the previous batch if it exists
+     g_batch = llama_batch_init(4096, 0, 1); // Larger batch for safety
+
+     // Prefill
+     g_batch.n_tokens = 0;
+     for (int i = 0; i < count; i++) {
+         for (size_t j = 0; j < g_all_tokens[i].size(); j++) {
+             bool is_last = (j == g_all_tokens[i].size() - 1);
+             if (is_last) g_logits_idx[i] = g_batch.n_tokens;
+             batch_add(g_batch, g_all_tokens[i][j], g_n_pos[i]++, { (llama_seq_id)i }, is_last);
+         }
+     }
+
+     // Decode prefill
+     if (llama_decode(g_ctx, g_batch)) {
+         fprintf(stderr, "Failed to decode prefill\n");
+     }
+ }
+
+ // 2. Decode one step (generate the next token for every active sequence)
+ // Returns true if any sequence is still active, false if all are done
+ bool decode_step(const char** results) {
+     if (g_step >= g_max_tokens) return false;
+
+     g_batch.n_tokens = 0;
+     bool any_active = false;
+     std::vector<int> next_logits_idx(g_count, -1);
+     int current_batch_pos = 0;
+
+     for (int i = 0; i < g_count; i++) {
+         results[i] = nullptr; // Default to null (no new token this step if inactive)
+
+         if (!g_active[i]) continue;
+
+         // Sample
+         llama_token id = llama_sampler_sample(g_smpl, g_ctx, g_logits_idx[i]);
+         llama_sampler_accept(g_smpl, id);
+
+         // Check EOG/limit
+         if (llama_vocab_is_eog(g_vocab, id) || g_n_pos[i] >= 4096) { // Hard limit matches n_ctx
+             g_active[i] = false;
+             continue;
+         }
+
+         // Decode token to string
+         static char buf[256]; // Static buffer for simplicity (the engine is single threaded)
+         int n = llama_token_to_piece(g_vocab, id, buf, sizeof(buf) - 1, 0, true);
+         if (n < 0) {
+             n = 0; // Conversion failed; treat as an empty piece
+         }
+         buf[n] = '\0'; // llama_token_to_piece does not null-terminate
+
+         // strdup() so the pointer stays valid after this call returns.
+         // The Python bridge reads the piece immediately; it currently never frees it,
+         // so each token leaks a small allocation in this demo (see bridge.py).
+         results[i] = strdup(buf);
+
+         next_logits_idx[i] = current_batch_pos++;
+         batch_add(g_batch, id, g_n_pos[i]++, { (llama_seq_id)i }, true);
+         any_active = true;
+     }
+
+     if (!any_active) return false;
+
+     g_logits_idx = next_logits_idx;
+     if (llama_decode(g_ctx, g_batch)) {
+         return false;
+     }
+
+     g_step++;
+     return true;
+ }
+
+ // Cleanup
+ void cleanup() {
+     if (g_smpl) llama_sampler_free(g_smpl);
+     if (g_ctx) llama_free(g_ctx);
+     if (g_model) llama_model_free(g_model);
+ }
+
+ }
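
cleanup() is exported but never called from the Python side. A hedged sketch of how api/bridge.py could register it on interpreter exit (not part of the commit; shown standalone here, but in bridge.py it would reuse the existing lib handle instead of reloading the library):

    import atexit
    import ctypes
    import os

    # Load the engine (bridge.py already has this handle as `lib`)
    lib = ctypes.CDLL(os.path.abspath("../engine/libbatch.so"))
    lib.cleanup.argtypes = []
    lib.cleanup.restype = None

    # Free the sampler, context and model when the Python process exits
    atexit.register(lib.cleanup)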
engine/libbatch.so ADDED
Binary file (32.6 kB).
 
model/config.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "name": "phi3",
+     "architecture": "phi",
+     "context_length": 4096,
+     "quantization": "Q4_K_M"
+ }
model/template.txt ADDED
@@ -0,0 +1,7 @@
+ <|system|>
+ You are a helpful AI assistant.
+ <|end|>
+ <|user|>
+ {{prompt}}
+ <|end|>
+ <|assistant|>
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ fastapi
+ uvicorn
+ llama-cpp-python
setup_model.sh ADDED
@@ -0,0 +1,56 @@
+ #!/bin/bash
+ set -e
+
+ # Default model URL (override by passing a direct .gguf download link as the first argument)
+ DEFAULT_URL="https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF?show_file_info=Nanbeige4.1-3B.Q8_0.gguf"
+ MODEL_URL=${1:-$DEFAULT_URL}
+ MODEL_DIR="model"
+ ENGINE_DIR="engine"
+
+ echo ">>> Setting up Ultra-Fast LLM..."
+
+ # 1. Set up the model directory
+ if [ ! -d "$MODEL_DIR" ]; then
+     mkdir -p "$MODEL_DIR"
+ fi
+
+ # 2. Download the model
+ if [ ! -f "$MODEL_DIR/model.gguf" ]; then
+     echo ">>> Downloading model from $MODEL_URL..."
+     wget -O "$MODEL_DIR/model.gguf" "$MODEL_URL" --show-progress
+ else
+     echo ">>> Model already exists, using the existing file..."
+ fi
+
+ # 3. Create the chat template (Phi-3 default)
+ if [ ! -f "$MODEL_DIR/template.txt" ]; then
+     printf '<|user|>\n{{prompt}}<|end|>\n<|assistant|>\n' > "$MODEL_DIR/template.txt"
+     echo ">>> Created default Phi-3 template."
+ fi
+
+ # 4. Check for a virtual environment
+ if [ ! -d ".venv" ]; then
+     echo ">>> Creating Python virtual environment..."
+     python3 -m venv .venv
+     source .venv/bin/activate
+     pip install -r requirements.txt
+ else
+     source .venv/bin/activate
+ fi
+
+ # 5. Compile the engine
+ echo ">>> Compiling C++ Optimization Engine..."
+ if [ -d "$ENGINE_DIR" ]; then
+     cd "$ENGINE_DIR"
+     g++ -O2 -march=native -shared -fPIC -o libbatch.so batch_server.cpp \
+         -I"../.venv/lib/python3.12/site-packages/include" \
+         -L"../.venv/lib/python3.12/site-packages/llama_cpp/lib" \
+         -lllama -Wl,-rpath,"../.venv/lib/python3.12/site-packages/llama_cpp/lib"
+     cd ..
+ else
+     echo "Error: engine directory not found!"
+     exit 1
+ fi
+
+ echo ">>> Setup Complete!"
+ echo "Run server with: cd api && ../.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000"