# app.py
import os, time
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# ---------- Minimal fixed config (fast on CPU) ----------
REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"  # 1B = much faster on CPU
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# Build-time prefetch location (a Dockerfile step is expected to put the model here)
BUILD_DIR = "/app/models"
MODEL_PATH = os.path.join(BUILD_DIR, FILENAME)
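# (The Dockerfile itself is not part of this file; a hypothetical build-time
#  prefetch step could look like the following, run during `docker build`:)
#    RUN python -c "from huggingface_hub import hf_hub_download; \
#        hf_hub_download('bartowski/Llama-3.2-1B-Instruct-GGUF', \
#                        'Llama-3.2-1B-Instruct-Q4_K_M.gguf', local_dir='/app/models')"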
# Writable runtime cache if the prebuilt file isn't present
RUNTIME_CACHE = "/tmp/hf_cache"
N_THREADS = min(4, os.cpu_count() or 2)
N_BATCH = 8
N_CTX = 2048
MAX_TOKENS = 16
TEMPERATURE = 0.7
TOP_P = 0.9
STOP = ["</s>", "<|eot_id|>"]

# ---------- App ----------
app = FastAPI(title="Simple Llama Server (1B fast)")
model = None
effective_model_path = None


class PromptRequest(BaseModel):
    prompt: str


@app.on_event("startup")
def load_model():
    global model, effective_model_path
    # 1) If the model exists from the Docker build, use it directly (no writes)
    if os.path.isfile(MODEL_PATH):
        effective_model_path = MODEL_PATH
    else:
        # 2) Otherwise, download to a writable temp cache (NOT under /app)
        os.makedirs(RUNTIME_CACHE, exist_ok=True)
        effective_model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=RUNTIME_CACHE,
            local_dir_use_symlinks=False,
        )

    t0 = time.time()
    model = Llama(
        model_path=effective_model_path,
        chat_format="llama-3",
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_batch=N_BATCH,
        use_mmap=True,   # faster load
        n_gpu_layers=0,  # CPU only
        verbose=False,
    )
    print(f"[startup] loaded {effective_model_path} in {time.time()-t0:.2f}s")


@app.get("/health")
def health():
    return {"ok": model is not None, "model_path": effective_model_path}


@app.post("/generate")
def generate(req: PromptRequest):
    if model is None:
        raise HTTPException(status_code=500, detail="Model not loaded")
    prompt = (req.prompt or "").strip()
    if not prompt:
        raise HTTPException(status_code=400, detail="prompt must be non-empty")

    t0 = time.time()
    out = model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        stop=STOP,
    )
    text = out["choices"][0]["message"]["content"]
    return JSONResponse({"response": text, "timing_sec": round(time.time()-t0, 2)})
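
# ---------- Running / example request (assumed setup, not defined in this file) ----------
# A typical way to launch the app (assuming uvicorn is installed); Hugging Face
# Spaces conventionally expose port 7860:
#   uvicorn app:app --host 0.0.0.0 --port 7860
# Example call once the server is up:
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Say hello in one sentence."}'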