Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files

Kalpokoch committed on Aug 22

Commit

85c3f24

verified ·

1 Parent(s): f13ef99

Update app/app.py

Browse files

Files changed (1) hide show

app/app.py +58 -262

app/app.py CHANGED Viewed

@@ -3,47 +3,39 @@ import json
 import asyncio
 import logging
 import uuid
 from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel
-from typing import Optional, Dict
 from llama_cpp import Llama
 # Correctly reference the module within the 'app' package
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s')
 class RequestIdAdapter(logging.LoggerAdapter):
     def process(self, msg, kwargs):
-        return '[%s] %s' % (self.extra.get('request_id', 'N/A'), msg), kwargs
 logger = logging.getLogger("app")
 # -----------------------------
 # ✅ Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
 CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "90"))
 RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
 TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))
 TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "1"))
-LLM_THREADS = int(os.getenv("LLM_THREADS", "4"))  # configurable threads
 # -----------------------------
 # ✅ Initialize FastAPI App
 # -----------------------------
-app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.1.1")
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
@@ -53,7 +45,6 @@ async def add_request_id(request: Request, call_next):
     response.headers["X-Request-ID"] = request_id
     return response
 # -----------------------------
 # ✅ Vector DB and Data Initialization
 # -----------------------------
@@ -64,33 +55,29 @@ try:
         top_k_default=TOP_K_SEARCH,
         relevance_threshold=RELEVANCE_THRESHOLD
     )
     if not ensure_db_populated(db, CHUNKS_FILE_PATH):
         logger.warning("DB not populated on startup. RAG will not function correctly.")
         db_ready = False
     else:
         logger.info("Vector DB is populated and ready.")
         db_ready = True
 except Exception as e:
     logger.error(f"FATAL: Failed to initialize Vector DB: {e}", exc_info=True)
     db = None
     db_ready = False
 # -----------------------------
-# ✅ Load TinyLlama GGUF Model with Safer Generation
 # -----------------------------
 logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
         n_ctx=2048,
-        n_threads=LLM_THREADS,
-        n_batch=256,
         use_mlock=True,
-        verbose=False,
-        seed=42
     )
     logger.info("GGUF model loaded successfully.")
     model_ready = True
@@ -99,21 +86,12 @@ except Exception as e:
     llm = None
     model_ready = False
 # -----------------------------
 # ✅ API Schemas
 # -----------------------------
 class Query(BaseModel):
     question: str
-class AdvancedQuery(BaseModel):
-    question: str
-    section_filter: Optional[str] = None
-    chunk_type_filter: Optional[str] = None
-    top_k: Optional[int] = None
 class Feedback(BaseModel):
     request_id: str
     question: str
@@ -122,136 +100,16 @@ class Feedback(BaseModel):
     feedback: str
     comment: str | None = None
 # -----------------------------
-# ✅ Helper Functions
 # -----------------------------
 def get_logger_adapter(request: Request):
     return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
-def get_chunk_priority(chunk: Dict) -> int:
-    """Assign priority to different chunk types for better context selection"""
-    priority_order = [
-        'approval_authority',
-        'delegation_summary',
-        'requirement',
-        'method_specific',
-        'board_approval',
-        'financial_concurrence',
-        'composition'
-    ]
-    chunk_type = chunk['metadata'].get('chunk_type', 'unknown')
-    try:
-        return priority_order.index(chunk_type)
-    except ValueError:
-        return len(priority_order)  # Lower priority for unknown types
-def detect_filters(question_lower: str) -> tuple:
-    """Detect section and chunk type filters from user question"""
-    section_filter = None
-    chunk_type_filter = None
-    # Section keyword mapping
-    section_keywords = {
-        "annexure": "Annexure A",
-        "financial concurrence": "Financial Concurrence",
-        "guidelines": "Guidelines",
-        "section 1": "I", "section i": "I",
-        "section 2": "II", "section ii": "II",
-        "section 3": "III", "section iii": "III",
-        "section 4": "IV", "section iv": "IV"
-    }
-    # Chunk type keyword mapping
-    chunk_type_keywords = {
-        "approval": "approval_authority",
-        "delegation": "delegation_summary",
-        "requirement": "requirement",
-        "method": "method_specific",
-        "board": "board_approval",
-        "committee": "composition"
-    }
-    for keyword, section in section_keywords.items():
-        if keyword in question_lower:
-            section_filter = section
-            break
-    for keyword, chunk_type in chunk_type_keywords.items():
-        if keyword in question_lower:
-            chunk_type_filter = chunk_type
-            break
-    return section_filter, chunk_type_filter
-def clean_llm_response(raw_response: str) -> str:
-    """Simplified cleaner to avoid over-trimming."""
-    if not raw_response:
-        return ""
-    return raw_response.strip()
-async def generate_llm_response(prompt: str, request_id: str, adapter: RequestIdAdapter):
-    """LLM response generation with safer stops and robust extraction."""
-    loop = asyncio.get_running_loop()
-    # Use plain completion configs without fragile stop tokens
-    generation_configs = [
-        {"max_tokens": 512, "temperature": 0.2, "top_p": 0.9, "repeat_penalty": 1.1, "stop": []},
-        {"max_tokens": 384, "temperature": 0.3, "top_p": 0.9, "repeat_penalty": 1.1, "stop": []},
-        {"max_tokens": 256, "temperature": 0.4, "top_p": 0.9, "repeat_penalty": 1.1, "stop": []},
-    ]
-    for attempt, config in enumerate(generation_configs, 1):
-        try:
-            adapter.info(f"LLM generation attempt {attempt}/{len(generation_configs)} with config: {config}")
-            response = await loop.run_in_executor(
-                None,
-                lambda: llm(prompt, echo=False, **config)
-            )
-            # Debug: log a truncated snapshot of the raw response
-            try:
-                adapter.info(f"Raw LLM response object (truncated): {json.dumps(response)[:1200]}")
-            except Exception:
-                pass
-            raw_answer = ""
-            if isinstance(response, dict) and "choices" in response and response["choices"]:
-                choice = response["choices"][0]
-                if isinstance(choice, dict):
-                    raw_answer = choice.get("text") or choice.get("message", {}).get("content", "") or ""
-            cleaned_answer = clean_llm_response(raw_answer)
-            adapter.info(f"Attempt {attempt} - Raw response length: {len(raw_answer)}, Cleaned length: {len(cleaned_answer)}")
-            # Accept concise answers
-            if cleaned_answer and len(cleaned_answer.strip()) > 3:
-                adapter.info(f"Successful generation on attempt {attempt}")
-                return cleaned_answer
-            else:
-                adapter.warning(f"Attempt {attempt} produced insufficient response: '{cleaned_answer}'")
-        except Exception as e:
-            adapter.error(f"Attempt {attempt} failed: {e}", exc_info=True)
-            continue
-    adapter.error("All LLM generation attempts failed")
-    raise ValueError("Unable to generate a meaningful response after multiple attempts")
-# -----------------------------
-# ✅ Endpoints
-# -----------------------------
 @app.get("/")
 async def root():
     return {"status": "✅ Server is running."}
 @app.get("/health")
 async def health_check():
     status = {
@@ -259,17 +117,25 @@ async def health_check():
         "database_status": "ready" if db_ready else "error",
         "model_status": "ready" if model_ready else "error"
     }
     if not db_ready or not model_ready:
         raise HTTPException(status_code=503, detail=status)
     return status
 @app.post("/chat")
 async def chat(query: Query, request: Request):
     adapter = get_logger_adapter(request)
-    question = query.question.strip()
-    question_lower = question.lower()
     # --- GREETING & INTRO HANDLING ---
     greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
@@ -293,99 +159,67 @@ async def chat(query: Query, request: Request):
     adapter.info(f"Received query: '{query.question}'")
-    # 1. Enhanced Search with potential filtering
-    section_filter, chunk_type_filter = detect_filters(question_lower)
-    if section_filter or chunk_type_filter:
-        adapter.info(f"Detected filters - section: '{section_filter}', chunk_type: '{chunk_type_filter}'")
-        search_results = db.search_with_filters(
-            query.question,
-            top_k=TOP_K_SEARCH,
-            section_filter=section_filter,
-            chunk_type_filter=chunk_type_filter
-        )
-        adapter.info("Used filtered search")
-    else:
-        search_results = db.search(query.question, top_k=TOP_K_SEARCH)
-        adapter.info("Used regular search")
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
         return {
-            "request_id": request.state.request_id,
             "question": query.question,
             "context_used": "No relevant context found.",
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
         }
-    # 2. Enhanced logging of retrieved chunks
-    chunk_types = [result['metadata'].get('chunk_type', 'unknown') for result in search_results]
-    sections = [result['metadata'].get('section', 'unknown') for result in search_results]
     scores = [f"{result['relevance_score']:.4f}" for result in search_results]
-    adapter.info(f"Found {len(search_results)} relevant chunks")
-    adapter.info(f"Chunk types: {chunk_types}")
-    adapter.info(f"Sections: {sections}")
-    adapter.info(f"Relevance scores: {scores}")
-    # 3. Prioritize chunk types for better context selection
-    prioritized_results = sorted(search_results, key=lambda x: (get_chunk_priority(x), -x['relevance_score']))
-    prioritized_types = [result['metadata'].get('chunk_type', 'unknown') for result in prioritized_results]
-    adapter.info(f"Prioritized chunk types order: {prioritized_types}")
-    # 4. Prepare Context using prioritized results
-    context_chunks = [result['text'] for result in prioritized_results[:TOP_K_CONTEXT]]
     context = "\n---\n".join(context_chunks)
-    # 5. Enhanced context logging
-    context_metadata = []
-    for result in prioritized_results[:TOP_K_CONTEXT]:
-        metadata = result['metadata']
-        context_info = {
-            'section': metadata.get('section', 'unknown'),
-            'clause': metadata.get('clause', 'unknown'),
-            'chunk_type': metadata.get('chunk_type', 'unknown'),
-            'score': f"{result['relevance_score']:.4f}"
-        }
-        context_metadata.append(context_info)
-    adapter.info(f"Selected context metadata: {context_metadata}")
-    # 6. Build Plain Completion Prompt (no [INST] tags)
-    prompt = (
-        "You are a helpful assistant for NEEPCO's Delegation of Powers policy. "
-        "Answer the question using only the provided context.\n\n"
-        f"Context:\n{context}\n\n"
-        f"Question:\n{query.question}\n\n"
-        "Provide a clear, direct answer based only on the context above. If the context doesn't contain the information, "
-        "say \"The provided policy context does not contain information on this topic.\"\n\n"
-        "Answer:\n"
-    )
-    # Optional: log a short preview of the prompt to debug future issues (safe/truncated)
-    try:
-        adapter.info(f"Prompt preview (first 400 chars): {prompt[:400].replace(chr(10),' ')}")
-    except Exception:
-        pass
-    # 7. Generate Response
     answer = "An error occurred while processing your request."
     try:
         adapter.info("Sending prompt to LLM for generation...")
         raw_answer = await asyncio.wait_for(
-            generate_llm_response(prompt, request.state.request_id, adapter),
             timeout=LLM_TIMEOUT_SECONDS
         )
-        adapter.info(f"LLM generation successful. Response length: {len(raw_answer)}")
         # --- POST-PROCESSING LOGIC ---
         if '|' in raw_answer:
             adapter.info("Pipe separator found. Formatting response as a bulleted list.")
             items = raw_answer.split('|')
             cleaned_items = [f"* {item.strip()}" for item in items if item.strip()]
             answer = "\n".join(cleaned_items)
         else:
             answer = raw_answer
     except asyncio.TimeoutError:
@@ -395,8 +229,7 @@ async def chat(query: Query, request: Request):
         adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
         answer = "Sorry, an unexpected error occurred while generating a response."
-    adapter.info("Final answer prepared. Returning to client.")
     return {
         "request_id": request.state.request_id,
         "question": query.question,
@@ -404,42 +237,6 @@ async def chat(query: Query, request: Request):
         "answer": answer
     }
-@app.post("/advanced_search")
-async def advanced_search(query: AdvancedQuery, request: Request):
-    """Advanced search endpoint with explicit filters"""
-    adapter = get_logger_adapter(request)
-    if not db_ready:
-        raise HTTPException(status_code=503, detail="Database not ready")
-    adapter.info(f"Advanced search: question='{query.question}', section='{query.section_filter}', chunk_type='{query.chunk_type_filter}'")
-    search_results = db.search_with_filters(
-        query.question,
-        top_k=query.top_k or TOP_K_SEARCH,
-        section_filter=query.section_filter,
-        chunk_type_filter=query.chunk_type_filter
-    )
-    return {
-        "request_id": request.state.request_id,
-        "query": query.question,
-        "filters": {
-            "section": query.section_filter,
-            "chunk_type": query.chunk_type_filter
-        },
-        "results": [
-            {
-                "text": result['text'],
-                "metadata": result['metadata'],
-                "relevance_score": result['relevance_score']
-            }
-            for result in search_results
-        ]
-    }
 @app.post("/feedback")
 async def collect_feedback(feedback: Feedback, request: Request):
     adapter = get_logger_adapter(request)
@@ -452,6 +249,5 @@ async def collect_feedback(feedback: Feedback, request: Request):
         "feedback": feedback.feedback,
         "comment": feedback.comment
     }
     adapter.info(json.dumps(feedback_log))
     return {"status": "✅ Feedback recorded. Thank you!"}

 import asyncio
 import logging
 import uuid
+import re
 from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel
 from llama_cpp import Llama
 # Correctly reference the module within the 'app' package
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s')
 class RequestIdAdapter(logging.LoggerAdapter):
     def process(self, msg, kwargs):
+        return '[%s] %s' % (self.extra['request_id'], msg), kwargs
 logger = logging.getLogger("app")
 # -----------------------------
 # ✅ Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
 CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "90"))
 RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
 TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))
 TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "1"))
 # -----------------------------
 # ✅ Initialize FastAPI App
 # -----------------------------
+app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.1.0")
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
     response.headers["X-Request-ID"] = request_id
     return response
 # -----------------------------
 # ✅ Vector DB and Data Initialization
 # -----------------------------
         top_k_default=TOP_K_SEARCH,
         relevance_threshold=RELEVANCE_THRESHOLD
     )
     if not ensure_db_populated(db, CHUNKS_FILE_PATH):
         logger.warning("DB not populated on startup. RAG will not function correctly.")
         db_ready = False
     else:
         logger.info("Vector DB is populated and ready.")
         db_ready = True
 except Exception as e:
     logger.error(f"FATAL: Failed to initialize Vector DB: {e}", exc_info=True)
     db = None
     db_ready = False
 # -----------------------------
+# ✅ Load TinyLlama GGUF Model
 # -----------------------------
 logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
         n_ctx=2048,
+        n_threads=1,
+        n_batch=512,
         use_mlock=True,
+        verbose=False
     )
     logger.info("GGUF model loaded successfully.")
     model_ready = True
     llm = None
     model_ready = False
 # -----------------------------
 # ✅ API Schemas
 # -----------------------------
 class Query(BaseModel):
     question: str
 class Feedback(BaseModel):
     request_id: str
     question: str
     feedback: str
     comment: str | None = None
 # -----------------------------
+# ✅ Endpoints
 # -----------------------------
 def get_logger_adapter(request: Request):
     return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
 @app.get("/")
 async def root():
     return {"status": "✅ Server is running."}
 @app.get("/health")
 async def health_check():
     status = {
         "database_status": "ready" if db_ready else "error",
         "model_status": "ready" if model_ready else "error"
     }
     if not db_ready or not model_ready:
         raise HTTPException(status_code=503, detail=status)
     return status
+async def generate_llm_response(prompt: str, request_id: str):
+    loop = asyncio.get_running_loop()
+    response = await loop.run_in_executor(
+        None,
+        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:", "</s>"], temperature=0.05, echo=False)
+    )
+    answer = response["choices"][0]["text"].strip()
+    if not answer:
+        raise ValueError("Empty response from LLM")
+    return answer
 @app.post("/chat")
 async def chat(query: Query, request: Request):
     adapter = get_logger_adapter(request)
+    question_lower = query.question.strip().lower()
     # --- GREETING & INTRO HANDLING ---
     greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
     adapter.info(f"Received query: '{query.question}'")
+    # 1. Search Vector DB
+    search_results = db.search(query.question, top_k=TOP_K_SEARCH)
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
         return {
             "question": query.question,
             "context_used": "No relevant context found.",
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
         }
     scores = [f"{result['relevance_score']:.4f}" for result in search_results]
+    adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
+    # 2. Prepare Context
+    context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
     context = "\n---\n".join(context_chunks)
+    # 3. Build Prompt with Separator Instruction
+    prompt = f"""<|system|>
+You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
+Your task is to answer the user's question based ONLY on the provided context.
+- **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
+- **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
+</s>
+<|user|>
+### Relevant Context:
+```
+{context}
+```
+### Question:
+{query.question}
+</s>
+<|assistant|>
+### Detailed Answer:
+"""
+    # 4. Generate Response
     answer = "An error occurred while processing your request."
     try:
         adapter.info("Sending prompt to LLM for generation...")
         raw_answer = await asyncio.wait_for(
+            generate_llm_response(prompt, request.state.request_id),
             timeout=LLM_TIMEOUT_SECONDS
         )
+        adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
         # --- POST-PROCESSING LOGIC ---
+        # Check if the model used the pipe separator, indicating a list.
         if '|' in raw_answer:
             adapter.info("Pipe separator found. Formatting response as a bulleted list.")
+            # Split the string into a list of items
             items = raw_answer.split('|')
+            # Clean up each item and format it as a bullet point
             cleaned_items = [f"* {item.strip()}" for item in items if item.strip()]
+            # Join them back together with newlines
             answer = "\n".join(cleaned_items)
         else:
+            # If no separator, use the answer as is.
             answer = raw_answer
     except asyncio.TimeoutError:
         adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
         answer = "Sorry, an unexpected error occurred while generating a response."
+    adapter.info(f"Final answer prepared. Returning to client.")
     return {
         "request_id": request.state.request_id,
         "question": query.question,
         "answer": answer
     }
 @app.post("/feedback")
 async def collect_feedback(feedback: Feedback, request: Request):
     adapter = get_logger_adapter(request)
         "feedback": feedback.feedback,
         "comment": feedback.comment
     }
     adapter.info(json.dumps(feedback_log))
     return {"status": "✅ Feedback recorded. Thank you!"}