# =======================
# ⚡ GGUF + llama-cpp-python FastAPI App for HF Spaces (CPU Optimized)
# =======================
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

# Project-local vector-store helpers
from app.policy_vector_db import PolicyVectorDB, ensure_db_populated

# Initialize FastAPI app
app = FastAPI()

# -----------------------------
# ✅ Vector DB Configuration
# -----------------------------
DB_PERSIST_DIRECTORY = "/app/vector_database"
CHUNKS_FILE_PATH = "/app/processed_chunks.json"
print("[INFO] Initializing vector DB...")
db = PolicyVectorDB(persist_directory=DB_PERSIST_DIRECTORY)
if not ensure_db_populated(db, CHUNKS_FILE_PATH):
    print("[WARNING] DB not populated. Chunks file may be missing.")
else:
    print("[INFO] Vector DB ready.")

# -----------------------------
# ✅ Load GGUF Model with llama-cpp-python (the model is pre-downloaded in the Dockerfile)
# -----------------------------
MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
print(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,       # small context window to stay within the CPU memory budget
    n_threads=2,      # matches the 2-vCPU allocation of a basic HF Space
    n_batch=32,       # prompt-evaluation batch size
    use_mlock=False,  # don't lock model pages in RAM on a constrained host
    verbose=False,
)
print("[INFO] Model loaded successfully.")

# -----------------------------
# ✅ Request Schema
# -----------------------------
class Query(BaseModel):
    question: str
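
# Example request body (illustrative question; any non-empty string works):
#   {"question": "How many days of casual leave are allowed per year?"}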

# -----------------------------
# ✅ Chat Endpoint
# -----------------------------
@app.post("/chat/")
async def chat(query: Query):
    question = query.question

    # Retrieve the most relevant policy chunks and join them into a context block
    search_results = db.search(question)
    context = "\n".join(res["text"] for res in search_results)

    # Simple RAG prompt; "###" delimits sections and doubles as the stop sequence
    prompt = f"### Context:\n{context}\n\n### Question: {question}\n### Answer:"
    response = llm(prompt, max_tokens=150, stop=["###"])
    answer = response["choices"][0]["text"].strip()
    return {"answer": answer}
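
# -----------------------------
# ✅ Example usage (illustrative; the host/port depend on the Space's Docker CMD,
# 7860 being the HF Spaces default)
# -----------------------------
# curl -X POST http://localhost:7860/chat/ \
#   -H "Content-Type: application/json" \
#   -d '{"question": "What is the process for applying for leave?"}'
#
# The endpoint responds with JSON of the form {"answer": "..."}.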