```python
import os

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
CACHE_DIR = "/app/models"  # matches Dockerfile pre-download

os.makedirs(CACHE_DIR, exist_ok=True)

app = FastAPI()

# Lazy-loaded singleton: the model is loaded once, on first use.
_model = None


def get_model():
    global _model
    if _model is not None:
        return _model
    # Resolves to the pre-downloaded file if it is already in CACHE_DIR;
    # otherwise the file is fetched from the Hub on first call.
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
    )
    _model = Llama(
        model_path=local_path,
        n_ctx=2048,  # context window in tokens
        n_threads=os.cpu_count() or 2,
        n_batch=256,
        verbose=False,
    )
    return _model


class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7


@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # Raw completion call; for chat-formatted prompts,
        # create_chat_completion() applies the model's chat template.
        output = model(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=["</s>"],
        )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
def health():
    try:
        # A successful model load counts as healthy; surfaces load errors early.
        _ = get_model()
        return {"ok": True}
    except Exception as e:
        return {"ok": False, "error": str(e)}
```