```python
import os

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
CACHE_DIR = "/app/models"  # matches Dockerfile pre-download

os.makedirs(CACHE_DIR, exist_ok=True)

app = FastAPI()

# Lazy-loaded singleton: the model is loaded once, on first use.
_model = None


def get_model():
    global _model
    if _model is not None:
        return _model
    # Resolves to the pre-downloaded file if it is already in CACHE_DIR;
    # otherwise the file is fetched from the Hub on first call.
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
    )
    _model = Llama(
        model_path=local_path,
        n_ctx=2048,  # context window in tokens
        n_threads=os.cpu_count() or 2,
        n_batch=256,
        verbose=False,
    )
    return _model


class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7


@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # Raw completion call; for chat-formatted prompts,
        # create_chat_completion() applies the model's chat template.
        output = model(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=["</s>"],
        )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
def health():
    try:
        # A successful model load counts as healthy; surfaces load errors early.
        _ = get_model()
        return {"ok": True}
    except Exception as e:
        return {"ok": False, "error": str(e)}
```