Kalpokoch committed on
Commit 9dfc4a0 · verified · 1 Parent(s): 39c476b

Update app/app.py

Files changed (1):
  1. app/app.py (+18 −20)
app/app.py CHANGED
@@ -3,33 +3,37 @@ from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from app.policy_vector_db import PolicyVectorDB # Import your class
+from app.policy_vector_db import PolicyVectorDB # Import your class
 
 # --- 1. Initialize the Vector Database and LLM ---
 
-# Load the vector database.
-# This connects to the persistent ChromaDB storage created by policy_vector_db.py
+# Load the vector database from /tmp (safest in Docker/HF Spaces)
 print("Loading Vector Database...")
-db = PolicyVectorDB(persist_directory="../policy_vector_db")
+db = PolicyVectorDB(persist_directory="/tmp/policy_vector_db")
 print("Vector Database loaded successfully!")
 
-# Load your fine-tuned model from Hugging Face Hub
-model_id = "Kalpokoch/QuntizedTinyLama"
+# Load your quantized model from Hugging Face Hub
+model_id = "Kalpokoch/QuantizedTinyLlama" # Correct spelling assumed
 print(f"Loading model: {model_id}...")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Choose dtype depending on device support
+dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
+
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    torch_dtype=torch.bfloat16,
+    torch_dtype=dtype,
     device_map="auto"
 )
 
-# Create a text-generation pipeline for the LLM
+# Create a text-generation pipeline
 pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
     max_new_tokens=256
 )
+
 print("LLM and pipeline loaded successfully!")
 
 
@@ -44,13 +48,16 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+
 @app.get("/")
 def read_root():
-    return {"message": "RAG chatbot backend is running with Kalpokoch/QuntizedTinyLama and ChromaDB!"}
+    return {"message": "RAG chatbot backend is running with Kalpokoch/QuantizedTinyLlama and ChromaDB!"}
+
 
 class ChatRequest(BaseModel):
     question: str
 
+
 @app.post("/chat")
 def chat(request: ChatRequest):
     question = request.question.strip()
@@ -58,21 +65,17 @@ def chat(request: ChatRequest):
         return {"response": "Please ask a question."}
 
     # --- 3. RAG Retrieval using PolicyVectorDB ---
-    # Use the search method from your class to find relevant context
     print(f"Searching for context for question: '{question}'")
     search_results = db.search(query_text=question, top_k=3)
-
-    # Check if any results were found
+
     if not search_results:
         retrieved_context = "No relevant context found."
     else:
-        # Format the retrieved documents into a single context string
        retrieved_context = "\n\n".join([result['text'] for result in search_results])
-
+
     print(f"Retrieved Context:\n{retrieved_context[:500]}...")
 
     # --- 4. Prompt Engineering and Generation ---
-    # Build the prompt with the retrieved context
     prompt = (
         f"<|system|>\nYou are a helpful assistant for NEEPCO policies. "
         f"Use the following context to answer the user's question. If the context doesn't contain the answer, say that.\n"
@@ -81,16 +84,11 @@ def chat(request: ChatRequest):
         f"<|assistant|>"
     )
 
-    # Generate a response using the pipeline
     try:
         outputs = pipe(prompt)
         reply = outputs[0]['generated_text']
-
-        # Extract only the assistant's newly generated reply
         assistant_reply = reply.split("<|assistant|>")[1].strip()
-
         return {"response": assistant_reply}
     except Exception as e:
         print(f"Error during model inference: {e}")
         return {"response": "Sorry, I encountered an error while generating a response."}
-