import os

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
CACHE_DIR = "/app/models"  # matches Dockerfile pre-download

os.makedirs(CACHE_DIR, exist_ok=True)
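# The Dockerfile referenced above is assumed to warm this cache at build time
# so the first request does not have to download the weights. A minimal sketch
# of that step (the exact RUN line is an assumption, not taken from this repo):
#
#   RUN python -c "from huggingface_hub import hf_hub_download; \
#       hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', \
#       filename='Llama-3.2-3B-Instruct-Q4_K_M.gguf', cache_dir='/app/models')"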

app = FastAPI()

# Lazily-initialised model handle; loaded on first use and reused afterwards.
_model = None


def get_model():
    """Download the GGUF weights (if needed) and cache a single Llama instance."""
    global _model
    if _model is not None:
        return _model
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
        local_dir_use_symlinks=False,
    )
    _model = Llama(
        model_path=local_path,
        n_ctx=2048,                     # context window size
        n_threads=os.cpu_count() or 2,  # use every available CPU core
        n_batch=256,
        verbose=False,
    )
    return _model

class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/generate")
def generate_text(req: PromptRequest):
    """Run a single completion and return the generated text."""
    try:
        model = get_model()
        output = model(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=["</s>"],
        )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
def health():
    """Liveness probe; note that the first call also triggers the model load."""
    try:
        _ = get_model()
        return {"ok": True}
    except Exception as e:
        return {"ok": False, "error": str(e)}