"""
OpenAI-compatible API server for the Ternary Transformer Engine.
A drop-in replacement for llama-server.

(c) 2026 OpenTransformers Ltd / Scott Bisset
"""

import json
import os
import threading
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer

from inference import TernaryQwen, Tokenizer, load_kernel

MODEL_DIR = os.environ.get("TERNARY_MODEL_DIR", "deepseek-r1-1.5b-ternary")
TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "deepseek-r1-1.5b-hf")
HOST = os.environ.get("HOST", "127.0.0.1")
PORT = int(os.environ.get("PORT", "8080"))
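
# Example invocation overriding the defaults above (the model path and the
# script name "server.py" are illustrative, not part of the repo layout):
#
#   TERNARY_MODEL_DIR=/models/r1-ternary PORT=8081 python server.py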

print("Loading ternary kernel...")
kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))

print(f"Loading model from {MODEL_DIR}...")
model = TernaryQwen(MODEL_DIR, kernel)

print(f"Loading tokenizer from {TOKENIZER_DIR}...")
tokenizer = Tokenizer(TOKENIZER_DIR)

# One lock around generate(): inference is stateful (shared KV cache), so
# concurrent requests must take turns even though the server is threaded.
lock = threading.Lock()
print("Ready!")
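
# Note: model.generate() is assumed to return (token_ids, stats), where stats
# carries at least the keys read by the handler below: "tokens_generated",
# "prefill_tokens", "prefill_ms", "decode_ms", and "tok_per_sec".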

class Handler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path == "/v1/chat/completions":
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length))

            messages = body.get("messages", [])
            max_tokens = body.get("max_tokens", 256)
            temperature = body.get("temperature", 0.6)
            top_p = body.get("top_p", 0.95)

            prompt = tokenizer.apply_chat_template(messages)
            input_ids = tokenizer.encode(prompt)

            with lock:
                gen_ids, stats = model.generate(
                    input_ids,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                )

            text = tokenizer.decode(gen_ids)

            response = {
                "id": f"chatcmpl-ternary-{int(time.time())}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": "DeepSeek-R1-Distill-Qwen-1.5B-TERNARY",
                "choices": [{
                    "index": 0,
                    "message": {"role": "assistant", "content": text},
                    # "length" when generation ran to the token budget, else "stop".
                    "finish_reason": "length" if stats["tokens_generated"] >= max_tokens else "stop",
                }],
                "usage": {
                    "prompt_tokens": len(input_ids),
                    "completion_tokens": stats["tokens_generated"],
                    "total_tokens": len(input_ids) + stats["tokens_generated"],
                },
                # llama-server-style timings block, handy for benchmarking clients.
                "timings": {
                    "prompt_n": stats["prefill_tokens"],
                    "prompt_ms": stats["prefill_ms"],
                    "predicted_n": stats["tokens_generated"],
                    "predicted_ms": stats["decode_ms"],
                    "predicted_per_second": stats["tok_per_sec"],
                },
            }

            payload = json.dumps(response).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)
        else:
            self.send_response(404)
            self.end_headers()

    def do_GET(self):
        if self.path == "/health":
            payload = b'{"status":"ok","engine":"ternary-avx512"}'
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        # Silence the default per-request access log.
        pass
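
# Smoke test once the server is running (mirrors the handler above; only
# "messages" is required, the other fields fall back to the defaults):
#
#   curl http://127.0.0.1:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages":[{"role":"user","content":"Hello"}],"max_tokens":64}'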

if __name__ == "__main__":
    # ThreadingHTTPServer lets /health answer while a generation holds the
    # lock; a plain HTTPServer would serialize all requests and make the
    # lock redundant.
    server = ThreadingHTTPServer((HOST, PORT), Handler)
    print(f"Ternary engine serving on {HOST}:{PORT}")
    server.serve_forever()
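
# Minimal Python client sketch (assumes the default HOST/PORT; standard
# library only):
#
#   import json, urllib.request
#   req = urllib.request.Request(
#       "http://127.0.0.1:8080/v1/chat/completions",
#       data=json.dumps({"messages": [{"role": "user", "content": "Hi"}]}).encode(),
#       headers={"Content-Type": "application/json"},
#   )
#   with urllib.request.urlopen(req) as resp:
#       print(json.load(resp)["choices"][0]["message"]["content"])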