Kalpokoch committed
Commit 72e44d7 · verified · 1 Parent(s): f65ed48

Update app/app.py

Files changed (1)
  1. app/app.py +35 -31
app/app.py CHANGED
@@ -4,12 +4,12 @@ from llama_cpp import Llama
 import logging
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 import asyncio
-import os # Import os to check for environment variables
+import os
 
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("app")
 
 # -----------------------------
@@ -46,7 +46,7 @@ db = PolicyVectorDB(
     relevance_threshold=0  # Low for more inclusive matching
 )
 if not ensure_db_populated(db, CHUNKS_FILE_PATH):
-    logger.warning("[WARNING] DB not populated. Chunks file may be missing.")
+    logger.warning("[WARNING] DB not populated. Chunks file may be missing or empty. RAG will not function correctly.")
 else:
     logger.info("[INFO] Vector DB ready.")
 
@@ -55,11 +55,16 @@ else:
 # -----------------------------
 MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
 logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
+
+# --- IMPORTANT: Experiment with these Llama parameters based on your CPU performance ---
+# n_ctx: Context window size. Adjust if your typical prompts are significantly shorter.
+# n_threads: Number of CPU threads. Experiment with 1, 2, or 4 for best results on your specific CPU.
+# n_batch: Batch size for internal processing. Experiment with 4, 8, or 16 for latency vs. throughput.
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=512,  # Lower for faster inference
-    n_threads=2,
-    n_batch=16,
+    n_ctx=512,    # Keep at 512, or reduce if your context is always short, e.g., 384
+    n_threads=2,  # <--- EXPERIMENT HERE (try 1, 2, or 4)
+    n_batch=16,   # <--- EXPERIMENT HERE (try 4, 8, or 16)
     use_mlock=False,
     verbose=False
 )
@@ -76,24 +81,20 @@ class Query(BaseModel):
 # -----------------------------
 
 # Define a reasonable timeout for LLM inference (e.g., 30 seconds)
-# You can make this configurable via environment variable if needed
+# This can be configured via an environment variable in Hugging Face Spaces.
 LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30"))  # Default to 30 seconds
 
 async def generate_llm_response(prompt: str):
-    """Helper function to run LLM inference, allowing it to be awaited with a timeout."""
-    try:
-        # llama_cpp.Llama's __call__ method is synchronous.
-        # To make it awaitable for asyncio.wait_for, we can run it in a thread pool.
-        # FastAPI's async def handles this automatically by running synchronous
-        # code in a default thread pool.
-        response = llm(prompt, max_tokens=128, stop=["###"], temperature=0.2)
-        answer = response["choices"][0]["text"].strip()
-        if not answer:
-            raise ValueError("Empty response from LLM")
-        return answer
-    except Exception as e:
-        logger.error(f"[ERROR] LLM generation failed: {str(e)}")
-        raise  # Re-raise to be caught by the timeout mechanism or outer try/except
+    """
+    Helper function to run synchronous LLM inference.
+    FastAPI's async def automatically runs blocking code in a thread pool,
+    making it compatible with asyncio.wait_for.
+    """
+    response = llm(prompt, max_tokens=128, stop=["###"], temperature=0.2)
+    answer = response["choices"][0]["text"].strip()
+    if not answer:
+        raise ValueError("Empty response from LLM")
+    return answer
 
 @app.post("/chat")
 async def chat(query: Query):
@@ -110,18 +111,20 @@ async def chat(query: Query):
         reverse=True
     )
 
-    # 🪵 Log context scores
+    # 🪵 Log context scores for debugging
     for i, r in enumerate(search_results):
-        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}")
+        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}...")
 
     if not filtered:
+        logger.info("[RESPONSE] No relevant context found.")
        return {
            "question": question,
            "context_used": "No relevant context found above the relevance threshold.",
-            "answer": "Sorry, I need more detail in the question to provide an answer."
+            "answer": "Sorry, I need more detail in the question to provide an answer based on the policies."
        }
 
    context = filtered[0]["text"]
+    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f}): {context[:100]}...")
 
    # ✨ Prompt Template
    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
@@ -129,18 +132,19 @@ async def chat(query: Query):
    # 🔮 Run LLM with safety and timeout
    answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
    try:
-        # We use asyncio.wait_for to enforce a timeout on the LLM generation.
-        # If generate_llm_response doesn't complete within LLM_TIMEOUT_SECONDS,
-        # asyncio.TimeoutError will be raised.
+        # Enforce timeout for LLM generation
        answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
    except asyncio.TimeoutError:
-        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: {question}")
+        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: '{question}'")
        answer = "Sorry, I couldn't process your request right now. The operation took too long. Please try again later or rephrase your question."
+    except ValueError as e:  # Catch explicit ValueError from generate_llm_response (e.g., empty response)
+        logger.error(f"[ERROR] LLM generation returned an invalid response: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. The model returned an invalid answer. Please try again later or rephrase your question."
    except Exception as e:
-        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
-        answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
+        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. An unexpected error occurred. Please try again later or rephrase your question."
 
-    logger.info(f"[RESPONSE] {answer}")
+    logger.info(f"[RESPONSE] Answered for '{question}': {answer[:100]}...")  # Log beginning of answer
    return {
        "question": question,
        "context_used": context,