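"""Minimal FastAPI wrapper around a quantized Llama 3.2 3B Instruct GGUF model.

The GGUF file is fetched from the Hugging Face Hub on first use (or picked up
from the cache that the Dockerfile pre-downloads) and served with
llama-cpp-python. Endpoints: POST /generate for text completion and
GET /health for a readiness check.
"""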
import os
import threading

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
CACHE_DIR = "/app/models"  # matches the Dockerfile pre-download location
os.makedirs(CACHE_DIR, exist_ok=True)

app = FastAPI()

# Lazily loaded singleton. A single llama.cpp context is not safe for
# concurrent calls, so the same lock also serializes inference in /generate.
_model = None
_model_lock = threading.Lock()

def get_model():
    """Download (if needed) and load the GGUF model exactly once."""
    global _model
    if _model is not None:
        return _model

    with _model_lock:
        # Re-check inside the lock: another request thread may have
        # loaded the model while we were waiting.
        if _model is not None:
            return _model

        # Resolves to the cached copy if it was baked into the image;
        # otherwise downloads the file from the Hub into CACHE_DIR.
        local_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=CACHE_DIR,
        )
        _model = Llama(
            model_path=local_path,
            n_ctx=2048,                     # context window in tokens
            n_threads=os.cpu_count() or 2,  # use all available CPU cores
            n_batch=256,                    # prompt-processing batch size
            verbose=False,
        )
    return _model

class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # Serialize inference: a single llama.cpp context is not safe
        # for concurrent completion calls.
        with _model_lock:
            # Raw completion: the prompt is passed through as-is, without
            # applying the Llama 3 chat template.
            output = model(
                req.prompt,
                max_tokens=req.max_tokens,
                temperature=req.temperature,
                # "</s>" is a Llama-2-style stop string; llama.cpp also
                # stops at the model's own end-of-sequence token.
                stop=["</s>"],
            )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
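# Example request (a sketch; assumes the server listens on localhost:8000):
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF in one sentence.", "max_tokens": 64}'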

@app.get("/health")
def health():
    # Doubles as a warm-up probe: the first call triggers the model
    # download/load, so it can be slow until the GGUF file is cached.
    try:
        _ = get_model()
        return {"ok": True}
    except Exception as e:
        return {"ok": False, "error": str(e)}