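"""Minimal FastAPI wrapper around a quantized Llama 3.2 3B Instruct GGUF model.

The GGUF file is fetched from the Hugging Face Hub on first use (or picked up
from the cache that the Dockerfile pre-downloads) and served with
llama-cpp-python. Endpoints: POST /generate for text completion and
GET /health for a readiness check.
"""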
import os
import threading

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
CACHE_DIR = "/app/models"  # matches the Dockerfile pre-download location
os.makedirs(CACHE_DIR, exist_ok=True)

app = FastAPI()

# Lazily loaded singleton. A single llama.cpp context is not safe for
# concurrent calls, so the same lock also serializes inference in /generate.
_model = None
_model_lock = threading.Lock()

def get_model():
    """Download (if needed) and load the GGUF model exactly once."""
    global _model
    if _model is not None:
        return _model

    with _model_lock:
        # Re-check inside the lock: another request thread may have
        # loaded the model while we were waiting.
        if _model is not None:
            return _model

        # Resolves to the cached copy if it was baked into the image;
        # otherwise downloads the file from the Hub into CACHE_DIR.
        local_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=CACHE_DIR,
        )
        _model = Llama(
            model_path=local_path,
            n_ctx=2048,                     # context window in tokens
            n_threads=os.cpu_count() or 2,  # use all available CPU cores
            n_batch=256,                    # prompt-processing batch size
            verbose=False,
        )
    return _model

class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # Serialize inference: a single llama.cpp context is not safe
        # for concurrent completion calls.
        with _model_lock:
            # Raw completion: the prompt is passed through as-is, without
            # applying the Llama 3 chat template.
            output = model(
                req.prompt,
                max_tokens=req.max_tokens,
                temperature=req.temperature,
                # "</s>" is a Llama-2-style stop string; llama.cpp also
                # stops at the model's own end-of-sequence token.
                stop=["</s>"],
            )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
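# Example request (a sketch; assumes the server listens on localhost:8000):
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF in one sentence.", "max_tokens": 64}'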

@app.get("/health")
def health():
    # Doubles as a warm-up probe: the first call triggers the model
    # download/load, so it can be slow until the GGUF file is cached.
    try:
        _ = get_model()
        return {"ok": True}
    except Exception as e:
        return {"ok": False, "error": str(e)}