from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama

# Load the GGUF model once at import time so every request reuses the
# same instance instead of paying the model-load cost per call.
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
    filename="llama-3.2-1b-instruct-q4_k_m.gguf",
)

app = FastAPI()

class ChatRequest(BaseModel):
    # OpenAI-style chat messages: [{"role": "user", "content": "..."}]
    messages: list[dict]

@app.post("/chat")  # route path is illustrative; the original listing omitted the decorator
async def chat_completion(request: ChatRequest):
    try:
        # llama-cpp-python returns an OpenAI-style completion dict
        response = llm.create_chat_completion(messages=request.messages)
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
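
To exercise the endpoint locally, here is a minimal client sketch. It assumes the file above is saved as main.py, the server is started with uvicorn on the default port 8000, and the illustrative /chat route from the listing; the requests library is an extra dependency not used by the server itself.

# Start the server first:
#   uvicorn main:app --host 0.0.0.0 --port 8000
import requests  # pip install requests

payload = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"}
    ]
}
resp = requests.post("http://localhost:8000/chat", json=payload)
resp.raise_for_status()
completion = resp.json()
# create_chat_completion returns an OpenAI-style dict with a "choices" list
print(completion["choices"][0]["message"]["content"])

One design note: because the model is loaded at module import, each uvicorn worker process holds its own copy in memory, so running a single worker is the usual choice for a small quantized model like this one.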