# =======================
# GGUF + llama-cpp-python FastAPI App for HF Spaces (CPU Optimized)
# =======================
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

from app.policy_vector_db import PolicyVectorDB, ensure_db_populated

# Initialize FastAPI app
app = FastAPI()
# -----------------------------
# Vector DB Configuration
# -----------------------------
DB_PERSIST_DIRECTORY = "/app/vector_database"
CHUNKS_FILE_PATH = "/app/processed_chunks.json"

print("[INFO] Initializing vector DB...")
db = PolicyVectorDB(persist_directory=DB_PERSIST_DIRECTORY)
if not ensure_db_populated(db, CHUNKS_FILE_PATH):
    print("[WARNING] DB not populated. Chunks file may be missing.")
else:
    print("[INFO] Vector DB ready.")
# -----------------------------
# Load GGUF Model with llama-cpp-python (model is pre-downloaded in Dockerfile)
# -----------------------------
MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"

print(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,       # context window; kept small to limit CPU/RAM use
    n_threads=2,      # match the vCPU count of the Space's CPU hardware
    n_batch=32,       # prompt-processing batch size
    use_mlock=False,  # don't pin model pages in RAM
    verbose=False
)
print("[INFO] Model loaded successfully.")
# -----------------------------
# Request Schema
# -----------------------------
class Query(BaseModel):
    question: str
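# Example request body accepted by the schema above (the question text is
# purely illustrative):
#   {"question": "How many days of casual leave are allowed?"}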
# -----------------------------
# Chat Endpoint
# -----------------------------
@app.post("/chat")
async def chat(query: Query):
    question = query.question
    # Retrieve relevant policy chunks and stitch them into one context block
    search_results = db.search(question)
    context = "\n".join([res["text"] for res in search_results])
    prompt = f"""### Context:\n{context}\n\n### Question: {question}\n### Answer:"""
    response = llm(prompt, max_tokens=150, stop=["###"])
    answer = response["choices"][0]["text"].strip()
    return {"answer": answer}
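# -----------------------------
# Local smoke test (a sketch, not part of the Space runtime): drives the
# /chat route in-process via FastAPI's TestClient without starting a server.
# The question below is a made-up example; any string accepted by the Query
# schema works.
# -----------------------------
if __name__ == "__main__":
    from fastapi.testclient import TestClient

    client = TestClient(app)
    resp = client.post("/chat", json={"question": "What is the work-from-home policy?"})
    print(resp.status_code, resp.json())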