# =======================
# ⚡ GGUF + llama-cpp-python FastAPI App for HF Spaces (CPU Optimized)
# =======================
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

from app.policy_vector_db import PolicyVectorDB, ensure_db_populated

# Initialize FastAPI app
app = FastAPI()

# -----------------------------
# ✅ Vector DB Configuration
# -----------------------------
DB_PERSIST_DIRECTORY = "/app/vector_database"
CHUNKS_FILE_PATH = "/app/processed_chunks.json"

print("[INFO] Initializing vector DB...")
db = PolicyVectorDB(persist_directory=DB_PERSIST_DIRECTORY)
if not ensure_db_populated(db, CHUNKS_FILE_PATH):
    print("[WARNING] DB not populated. Chunks file may be missing.")
else:
    print("[INFO] Vector DB ready.")

# -----------------------------
# ✅ Load GGUF Model with llama-cpp-python (model is pre-downloaded in Dockerfile)
# -----------------------------
MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"

print(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,       # small context window to keep CPU memory usage low
    n_threads=2,      # limited thread count for the CPU-only Spaces hardware
    n_batch=32,
    use_mlock=False,
    verbose=False,
)
print("[INFO] Model loaded successfully.")

# -----------------------------
# ✅ Request Schema
# -----------------------------
class Query(BaseModel):
    question: str

# -----------------------------
# ✅ Chat Endpoint
# -----------------------------
@app.post("/chat/")
async def chat(query: Query):
    question = query.question

    # Retrieve the most relevant policy chunks and join them into a single context block
    search_results = db.search(question)
    context = "\n".join([res["text"] for res in search_results])

    # Simple instruction-style prompt: context, question, then the model completes the answer
    prompt = f"""### Context:\n{context}\n\n### Question: {question}\n### Answer:"""

    response = llm(prompt, max_tokens=150, stop=["###"])
    answer = response["choices"][0]["text"].strip()

    return {"answer": answer}
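
# -----------------------------
# ✅ Example usage (illustrative)
# -----------------------------
# A minimal sketch of how to exercise the /chat/ endpoint once the app is running.
# It assumes the server is launched with uvicorn on port 7860 (the port Hugging Face
# Spaces expects); adjust the host, port, and module path to match your deployment,
# and treat the sample question as a placeholder.
#
#   uvicorn main:app --host 0.0.0.0 --port 7860
#
#   curl -X POST http://localhost:7860/chat/ \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What does the policy say about leave carryover?"}'
#
# Expected response shape: {"answer": "<generated text>"}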