# Uploaded by OpenTransformer via upload-large-folder tool — commit 19ed98b (verified)
#!/usr/bin/env python3
"""
OpenAI-compatible API server for Ternary Transformer Engine.
Drop-in replacement for llama-server.
(c) 2026 OpenTransformers Ltd / Scott Bisset
"""
import json
import time
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from inference import TernaryQwen, Tokenizer, load_kernel
import os
# Paths and bind address are overridable via environment variables.
MODEL_DIR = os.environ.get("TERNARY_MODEL_DIR", "deepseek-r1-1.5b-ternary")
TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "deepseek-r1-1.5b-hf")
HOST = os.environ.get("HOST", "127.0.0.1")
PORT = int(os.environ.get("PORT", "8080"))
print("Loading ternary kernel...")
# Native kernel (.so) is expected to sit next to this script.
kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))
print(f"Loading model from {MODEL_DIR}...")
model = TernaryQwen(MODEL_DIR, kernel)
print(f"Loading tokenizer from {TOKENIZER_DIR}...")
tokenizer = Tokenizer(TOKENIZER_DIR)
# Serializes access to `model` across handler threads/requests; the engine
# appears to hold single-request state (KV cache), so generations must not
# overlap — NOTE(review): confirm against the inference module.
lock = threading.Lock()
print("Ready!")
class Handler(BaseHTTPRequestHandler):
    """OpenAI-compatible request handler.

    Routes:
        POST /v1/chat/completions -- non-streaming chat completion
        GET  /health              -- liveness probe

    Reads the module-level ``model``, ``tokenizer`` and ``lock`` globals.
    """

    def _send_json(self, status, payload):
        # Serialize once so we can emit a correct Content-Length header
        # (the original relied on connection close for framing).
        body = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_POST(self):
        """Handle a chat-completion request; 404 for any other path."""
        if self.path != "/v1/chat/completions":
            self.send_response(404)
            self.end_headers()
            return
        # A missing/garbage Content-Length or malformed JSON body used to
        # raise out of the handler and drop the connection; answer 400
        # instead. json.JSONDecodeError is a ValueError subclass.
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length))
        except ValueError:
            self._send_json(400, {"error": {
                "message": "invalid or missing JSON request body",
                "type": "invalid_request_error"}})
            return
        messages = body.get("messages", [])
        max_tokens = body.get("max_tokens", 256)
        temperature = body.get("temperature", 0.6)
        top_p = body.get("top_p", 0.95)
        # Build prompt
        prompt = tokenizer.apply_chat_template(messages)
        input_ids = tokenizer.encode(prompt)
        # Generate under the lock: generations must not overlap.
        # Surface engine failures as a 500 rather than a dead socket.
        try:
            with lock:
                gen_ids, stats = model.generate(
                    input_ids,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p
                )
        except Exception as e:
            self._send_json(500, {"error": {
                "message": str(e), "type": "server_error"}})
            return
        text = tokenizer.decode(gen_ids)
        # Response shape mirrors the OpenAI chat.completion object, with a
        # llama-server-style "timings" extension.
        response = {
            "id": f"chatcmpl-ternary-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "DeepSeek-R1-Distill-Qwen-1.5B-TERNARY",
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": len(input_ids),
                "completion_tokens": stats["tokens_generated"],
                "total_tokens": len(input_ids) + stats["tokens_generated"]
            },
            "timings": {
                "prompt_n": stats["prefill_tokens"],
                "prompt_ms": stats["prefill_ms"],
                "predicted_n": stats["tokens_generated"],
                "predicted_ms": stats["decode_ms"],
                "predicted_per_second": stats["tok_per_sec"],
            }
        }
        self._send_json(200, response)

    def do_GET(self):
        """Answer the health probe; 404 for any other path."""
        if self.path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"status":"ok","engine":"ternary-avx512"}')
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        """Silence BaseHTTPRequestHandler's per-request stderr logging."""
        pass
if __name__ == "__main__":
    # Bind the configured address and serve until the process is killed.
    httpd = HTTPServer((HOST, PORT), Handler)
    print(f"Ternary engine serving on {HOST}:{PORT}")
    httpd.serve_forever()