Update app/app.py
app/app.py  +22 -27  CHANGED
@@ -1,5 +1,3 @@
-# Complete and final app.py
-
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
@@ -53,19 +51,17 @@ else:
     logger.info("[INFO] Vector DB ready.")
 
 # -----------------------------
-# ✅ Load
+# ✅ Load TinyLlama GGUF Model
 # -----------------------------
-
-MODEL_PATH = "/app/dop-phi-1.5-Q4_K_M.gguf"
-
-
+MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
 logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
 
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=
-    n_threads=
-
+    n_ctx=512,
+    n_threads=1,
+    n_batch=8,
+    use_mlock=False,
     verbose=False
 )
 logger.info("[INFO] Model loaded successfully.")
@@ -79,12 +75,12 @@ class Query(BaseModel):
 # -----------------------------
 # ✅ Chat Endpoint
 # -----------------------------
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "
-logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30"))
+logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds at startup.")
 
 async def generate_llm_response(prompt: str):
     """Helper function to run synchronous LLM inference."""
-    response = llm(prompt, max_tokens=384, stop=["
+    response = llm(prompt, max_tokens=384, stop=["###"], temperature=0.2)
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
@@ -102,34 +98,33 @@ async def chat(query: Query):
         reverse=True
     )
 
+    for i, r in enumerate(search_results):
+        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}...")
+
     if not filtered:
         logger.info("[RESPONSE] No relevant context found.")
         return {
             "question": question,
-            "context_used": "No relevant context found.",
-            "answer": "Sorry, I
+            "context_used": "No relevant context found above the relevance threshold.",
+            "answer": "Sorry, I need more detail in the question to provide an answer based on the policies."
         }
 
     context = filtered[0]["text"]
-    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
+    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f}): {context[:100]}...")
 
-
-    prompt = f"""Instruct: Use the following context to answer the question.
-Context: {context}
-Question: {question}
-Output:"""
+    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
 
-    answer = "Sorry, I couldn't process your request right now. Please try again later."
+    answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
     try:
         answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
     except asyncio.TimeoutError:
-        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds
-        answer = "Sorry,
+        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. The operation took too long. Please try again later or rephrase your question."
     except Exception as e:
-        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
-        answer = "Sorry,
+        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. An unexpected error occurred. Please try again later or rephrase your question."
 
-    logger.info(f"[RESPONSE] Answered: {answer[:100]}...")
+    logger.info(f"[RESPONSE] Answered for '{question}': {answer[:100]}...")
     return {
         "question": question,
         "context_used": context,
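A note on the new loader settings: n_ctx=512, n_threads=1, n_batch=8 and use_mlock=False appear chosen to keep the TinyLlama GGUF model within the CPU and RAM limits of a small Space, at the cost of a short context window and slower generation. Below is a minimal sketch of the same call with the knobs read from the environment, mirroring how LLM_TIMEOUT_SECONDS is already handled; the variable names LLM_N_CTX, LLM_N_THREADS and LLM_N_BATCH are illustrative and not part of this commit.

import os
from llama_cpp import Llama

# Sketch only: same settings as the commit, but tunable via env vars.
# LLM_N_CTX / LLM_N_THREADS / LLM_N_BATCH are hypothetical names, not in app.py.
MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=int(os.getenv("LLM_N_CTX", "512")),        # prompt + completion token window
    n_threads=int(os.getenv("LLM_N_THREADS", "1")),  # CPU threads used for inference
    n_batch=int(os.getenv("LLM_N_BATCH", "8")),      # tokens evaluated per batch
    use_mlock=False,                                 # don't pin model pages in RAM
    verbose=False,
)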
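Related sizing note: generate_llm_response asks for max_tokens=384 while the model is loaded with n_ctx=512, so if the completion is to keep its full budget the prompt (instructions, retrieved chunk, and question) has roughly 128 tokens to work with, and an oversized chunk from the vector DB can overflow the window. A hedged sketch of one way to cap the chunk before building the prompt; the 400-character limit is an illustrative guess, not a value from this commit.

# Sketch: bound the retrieved chunk so the prompt fits the 512-token window.
# MAX_CONTEXT_CHARS is an illustrative budget, not tuned or taken from app.py.
MAX_CONTEXT_CHARS = 400

def build_prompt(context: str, question: str) -> str:
    context = context[:MAX_CONTEXT_CHARS]
    return (
        "You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies. "
        "Only use the context provided. Be precise.\n"
        f"### Relevant Context:\n{context}\n"
        f"### Question:\n{question}\n"
        "### Answer:"
    )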
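On the timeout path: asyncio.wait_for wraps generate_llm_response, but the llama_cpp call inside it is synchronous, so it blocks the event loop and the TimeoutError can only surface once inference has already finished. A minimal sketch, assuming llm is the module-level Llama instance, that moves the blocking call onto a worker thread so the timeout and other requests stay responsive; note that a timeout still does not stop the thread itself, it only lets the endpoint return early.

import asyncio

async def generate_llm_response(prompt: str) -> str:
    """Run the blocking llama_cpp call in a worker thread so the event loop stays free."""
    def _infer() -> dict:
        # Same call as in the commit; llm is the module-level Llama instance.
        return llm(prompt, max_tokens=384, stop=["###"], temperature=0.2)

    response = await asyncio.to_thread(_infer)
    answer = response["choices"][0]["text"].strip()
    if not answer:
        raise ValueError("Empty response from LLM")
    return answer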
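For quick verification after the Space rebuilds, something like the snippet below exercises the endpoint end to end. The route path (/chat), the Query field name (question), and the local port 7860 are assumptions based on the surrounding code and typical Space setups, not details visible in this diff; the sample question is illustrative only.

import requests

# Assumed route, field name, and port; adjust to match the actual app.
resp = requests.post(
    "http://localhost:7860/chat",
    json={"question": "Who can approve works contracts at the project level?"},
    timeout=60,
)
print(resp.json()["answer"])
print(resp.json()["context_used"][:200])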