# app.py
import os, time

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# ---------- Minimal fixed config (fast on CPU) ----------
REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"  # 1B = much faster on CPU
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# Build-time prefetch location (a Dockerfile step puts the model here; see the prefetch sketch below the listing)
BUILD_DIR = "/app/models"
MODEL_PATH = os.path.join(BUILD_DIR, FILENAME)

# Writable runtime cache if the prebuilt file isn't present
RUNTIME_CACHE = "/tmp/hf_cache"

N_THREADS = min(4, os.cpu_count() or 2)
N_BATCH = 8
N_CTX = 2048
MAX_TOKENS = 16
TEMPERATURE = 0.7
TOP_P = 0.9
STOP = ["</s>", "<|eot_id|>"]

# ---------- App ----------
app = FastAPI(title="Simple Llama Server (1B fast)")
model = None
effective_model_path = None


class PromptRequest(BaseModel):
    prompt: str

@app.on_event("startup")
def load_model():
    global model, effective_model_path
    # 1) If the model exists from the Docker build, use it directly (no writes)
    if os.path.isfile(MODEL_PATH):
        effective_model_path = MODEL_PATH
    else:
        # 2) Otherwise, download to a writable temp cache (NOT under /app)
        os.makedirs(RUNTIME_CACHE, exist_ok=True)
        effective_model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=RUNTIME_CACHE,
        )
    t0 = time.time()
    model = Llama(
        model_path=effective_model_path,
        chat_format="llama-3",
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_batch=N_BATCH,
        use_mmap=True,   # faster load
        n_gpu_layers=0,  # CPU only
        verbose=False,
    )
    print(f"[startup] loaded {effective_model_path} in {time.time()-t0:.2f}s")

@app.get("/health")
def health():
    return {"ok": model is not None, "model_path": effective_model_path}

@app.post("/generate")
def generate(req: PromptRequest):
    if model is None:
        raise HTTPException(status_code=500, detail="Model not loaded")
    prompt = (req.prompt or "").strip()
    if not prompt:
        raise HTTPException(status_code=400, detail="prompt must be non-empty")
    t0 = time.time()
    out = model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        stop=STOP,
    )
    text = out["choices"][0]["message"]["content"]
    return JSONResponse({"response": text, "timing_sec": round(time.time()-t0, 2)})
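
The config above assumes a Dockerfile build step has already downloaded the GGUF file into /app/models. A minimal sketch of such a prefetch helper follows; the script name prefetch_model.py and the idea of invoking it from a Dockerfile RUN step are assumptions for illustration, not part of the Space itself.

# prefetch_model.py -- hypothetical build-time helper, e.g. invoked as
# `RUN python prefetch_model.py` in the Dockerfile (assumed wiring).
from huggingface_hub import hf_hub_download

REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
BUILD_DIR = "/app/models"

# Place the file at /app/models/<FILENAME>, which is the MODEL_PATH that
# app.py checks first, so no download is needed at runtime.
path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=BUILD_DIR)
print(f"prefetched model to {path}")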
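
Once the server is running (for example via `uvicorn app:app --host 0.0.0.0 --port 7860`; Docker Spaces expect port 7860 by default), the two routes can be exercised with a small client. This is a usage sketch only, assuming the requests package and a locally reachable server; adjust BASE_URL to wherever the app actually listens.

# client.py -- hypothetical smoke test for the /health and /generate routes above
import requests

BASE_URL = "http://localhost:7860"  # assumption: server reachable here

# Confirm the model loaded at startup
print(requests.get(f"{BASE_URL}/health", timeout=10).json())

# Request a short completion (the server caps output at MAX_TOKENS = 16)
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Say hello in one sentence."},
    timeout=120,  # CPU generation can take a while on the first request
)
resp.raise_for_status()
print(resp.json())  # {"response": "...", "timing_sec": ...}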