Kalpokoch committed
Commit 7e6e5a8 · verified · 1 Parent(s): b5cf3d3

Update app/app.py

Files changed (1):
  1. app/app.py (+74, -141)
app/app.py CHANGED
@@ -4,8 +4,6 @@ import asyncio
 import logging
 import uuid
 import re
-import multiprocessing as mp
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel
 from llama_cpp import Llama
@@ -24,28 +22,21 @@ class RequestIdAdapter(logging.LoggerAdapter):
 logger = logging.getLogger("app")

 # -----------------------------
-# ✅ Configuration - Optimized for CPU
+# ✅ Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
 CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "60"))  # Reduced timeout
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "90"))
 RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
 TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))
 TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "1"))

-# CPU Optimization settings
-CPU_COUNT = mp.cpu_count()
-logger.info(f"Detected {CPU_COUNT} CPU cores")
-
 # -----------------------------
 # ✅ Initialize FastAPI App
 # -----------------------------
 app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.1.0")

-# Thread pool for async operations
-thread_executor = ThreadPoolExecutor(max_workers=CPU_COUNT * 2)
-
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
     request_id = str(uuid.uuid4())
@@ -76,26 +67,19 @@ except Exception as e:
     db_ready = False

 # -----------------------------
-# ✅ Load TinyLlama GGUF Model - Optimized
+# ✅ Load TinyLlama GGUF Model
 # -----------------------------
 logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=2048,  # Reduced context window for speed
-        n_threads=CPU_COUNT,  # Use all CPU cores
-        n_batch=256,  # Optimized batch size
+        n_ctx=4096,
+        n_threads=4,
+        n_batch=512,
         use_mlock=True,
-        verbose=False,
-        # Additional optimizations
-        n_gpu_layers=0,  # Force CPU only
-        rope_scaling_type=-1,  # Disable rope scaling for speed
-        use_mmap=True,  # Enable memory mapping
-        low_vram=False,  # We're on CPU
-        # CPU-specific optimizations
-        numa=True,  # Enable NUMA awareness if available
+        verbose=False
     )
-    logger.info("GGUF model loaded successfully with CPU optimizations.")
+    logger.info("GGUF model loaded successfully.")
     model_ready = True
 except Exception as e:
     logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
@@ -116,49 +100,6 @@ class Feedback(BaseModel):
     feedback: str
     comment: str | None = None

-# -----------------------------
-# ✅ Optimized LLM Generation
-# -----------------------------
-async def generate_llm_response(prompt: str, request_id: str):
-    """Optimized LLM response generation with better CPU utilization."""
-    loop = asyncio.get_running_loop()
-
-    def generate_response():
-        return llm(
-            prompt,
-            max_tokens=1024,  # Reduced for faster generation
-            stop=["###", "Question:", "Context:", "</s>", "\n\n###"],
-            temperature=0.05,
-            echo=False,
-            # CPU optimizations
-            repeat_penalty=1.1,
-            top_p=0.9,
-            top_k=40,
-            # Faster inference settings
-            typical_p=1.0,
-            mirostat_mode=0,  # Disable for speed
-        )
-
-    # Use thread executor for better concurrency
-    response = await loop.run_in_executor(thread_executor, generate_response)
-
-    answer = response["choices"][0]["text"].strip()
-    if not answer:
-        raise ValueError("Empty response from LLM")
-    return answer
-
-# -----------------------------
-# ✅ Optimized Search Function
-# -----------------------------
-async def perform_optimized_search(query_text: str):
-    """Perform vector search in a separate thread to avoid blocking."""
-    loop = asyncio.get_running_loop()
-
-    def search_db():
-        return db.search(query_text, top_k=TOP_K_SEARCH)
-
-    return await loop.run_in_executor(thread_executor, search_db)
-
 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
@@ -167,21 +108,30 @@ def get_logger_adapter(request: Request):

 @app.get("/")
 async def root():
-    return {"status": "✅ Server is running.", "cpu_cores": CPU_COUNT}
+    return {"status": "✅ Server is running."}

 @app.get("/health")
 async def health_check():
     status = {
         "status": "ok",
         "database_status": "ready" if db_ready else "error",
-        "model_status": "ready" if model_ready else "error",
-        "cpu_cores": CPU_COUNT,
-        "optimization": "enabled"
+        "model_status": "ready" if model_ready else "error"
     }
     if not db_ready or not model_ready:
         raise HTTPException(status_code=503, detail=status)
     return status

+async def generate_llm_response(prompt: str, request_id: str):
+    loop = asyncio.get_running_loop()
+    response = await loop.run_in_executor(
+        None,
+        lambda: llm(prompt, max_tokens=2048, stop=["###", "Question:", "Context:", "</s>"], temperature=0.05, echo=False)
+    )
+    answer = response["choices"][0]["text"].strip()
+    if not answer:
+        raise ValueError("Empty response from LLM")
+    return answer
+
 @app.post("/chat")
 async def chat(query: Query, request: Request):
     adapter = get_logger_adapter(request)
@@ -192,9 +142,9 @@ async def chat(query: Query, request: Request):
     if question_lower in greeting_keywords:
         adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
         intro_message = (
-            f"Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
-            f"My purpose is to help you find accurate information and answer questions based on this specific dataset. "
-            f"I am currently running optimized on a {CPU_COUNT}-core CPU environment. How can I assist you with the DoP policy today?"
+            "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
+            "My purpose is to help you find accurate information and answer questions based on this specific dataset. "
+            "I am currently running on a CPU-based environment. How can I assist you with the DoP policy today?"
         )
         return {
             "request_id": getattr(request.state, 'request_id', 'N/A'),
@@ -209,86 +159,75 @@ async def chat(query: Query, request: Request):

     adapter.info(f"Received query: '{query.question}'")

-    try:
-        # 1. Perform parallel search and prepare context
-        search_task = perform_optimized_search(query.question)
-        search_results = await search_task
+    # 1. Search Vector DB
+    search_results = db.search(query.question, top_k=TOP_K_SEARCH)

-        if not search_results:
-            adapter.warning("No relevant context found in vector DB.")
-            return {
-                "request_id": request.state.request_id,
-                "question": query.question,
-                "context_used": "No relevant context found.",
-                "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
-            }
-
-        scores = [f"{result['relevance_score']:.4f}" for result in search_results]
-        adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
+    if not search_results:
+        adapter.warning("No relevant context found in vector DB.")
+        return {
+            "question": query.question,
+            "context_used": "No relevant context found.",
+            "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
+        }
+
+    scores = [f"{result['relevance_score']:.4f}" for result in search_results]
+    adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")

-        # 2. Prepare Context (limit context size for faster processing)
-        context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
-        context = "\n---\n".join(context_chunks)
-
-        # Truncate context if too long for faster processing
-        max_context_length = 1500  # Reduced for faster inference
-        if len(context) > max_context_length:
-            context = context[:max_context_length] + "..."
-
-        # 3. Build optimized prompt
-        prompt = f"""<|system|>
+    # 2. Prepare Context
+    context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
+    context = "\n---\n".join(context_chunks)
+
+    # 3. Build Prompt with Separator Instruction
+    prompt = f"""<|system|>
 You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
 Your task is to answer the user's question based ONLY on the provided context.
+
 - **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
 - **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
-- **Brevity Rule:** Keep your answer concise and to the point.
 </s>
 <|user|>
 ### Relevant Context:
 ```
 {context}
 ```
+
 ### Question:
 {query.question}
 </s>
 <|assistant|>
-### Answer:
+### Detailed Answer:
 """

-        # 4. Generate Response with timeout
-        answer = "An error occurred while processing your request."
-        try:
-            adapter.info("Sending prompt to LLM for generation...")
-            raw_answer = await asyncio.wait_for(
-                generate_llm_response(prompt, request.state.request_id),
-                timeout=LLM_TIMEOUT_SECONDS
-            )
-            adapter.info(f"LLM generation successful. Raw response: {raw_answer[:100]}...")
-
-            # --- POST-PROCESSING LOGIC ---
-            # Check if the model used the pipe separator, indicating a list.
-            if '|' in raw_answer:
-                adapter.info("Pipe separator found. Formatting response as a bulleted list.")
-                # Split the string into a list of items
-                items = raw_answer.split('|')
-                # Clean up each item and format it as a bullet point
-                cleaned_items = [f"• {item.strip()}" for item in items if item.strip()]
-                # Join them back together with newlines
-                answer = "\n".join(cleaned_items)
-            else:
-                # If no separator, use the answer as is.
-                answer = raw_answer
-
-        except asyncio.TimeoutError:
-            adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
-            answer = "Sorry, the request took too long to process. Please try again with a simpler question."
-        except Exception as e:
-            adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
-            answer = "Sorry, an unexpected error occurred while generating a response."
-
+    # 4. Generate Response
+    answer = "An error occurred while processing your request."
+    try:
+        adapter.info("Sending prompt to LLM for generation...")
+        raw_answer = await asyncio.wait_for(
+            generate_llm_response(prompt, request.state.request_id),
+            timeout=LLM_TIMEOUT_SECONDS
+        )
+        adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
+
+        # --- POST-PROCESSING LOGIC ---
+        # Check if the model used the pipe separator, indicating a list.
+        if '|' in raw_answer:
+            adapter.info("Pipe separator found. Formatting response as a bulleted list.")
+            # Split the string into a list of items
+            items = raw_answer.split('|')
+            # Clean up each item and format it as a bullet point
+            cleaned_items = [f"* {item.strip()}" for item in items if item.strip()]
+            # Join them back together with newlines
+            answer = "\n".join(cleaned_items)
+        else:
+            # If no separator, use the answer as is.
+            answer = raw_answer
+
+    except asyncio.TimeoutError:
+        adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
+        answer = "Sorry, the request took too long to process. Please try again with a simpler question."
     except Exception as e:
-        adapter.error(f"An unexpected error occurred: {e}", exc_info=True)
-        answer = "Sorry, an unexpected error occurred. Please try again."
+        adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
+        answer = "Sorry, an unexpected error occurred while generating a response."

     adapter.info(f"Final answer prepared. Returning to client.")
     return {
@@ -312,9 +251,3 @@ async def collect_feedback(feedback: Feedback, request: Request):
     }
     adapter.info(json.dumps(feedback_log))
     return {"status": "✅ Feedback recorded. Thank you!"}
-
-# Graceful shutdown
-@app.on_event("shutdown")
-async def shutdown_event():
-    thread_executor.shutdown(wait=True)
-    logger.info("Application shutdown complete.")
 
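As an aside on the pattern used above: the rewritten generate_llm_response pushes the blocking llama_cpp call onto asyncio's default executor via run_in_executor(None, ...) and bounds it with asyncio.wait_for, which is what makes the dedicated ThreadPoolExecutor and the shutdown hook removable. A minimal standalone sketch of that pattern (illustrative names, not code from the repository):

```python
import asyncio
import time


def blocking_generate(prompt: str) -> str:
    # Stand-in for the blocking llm(...) call; sleep simulates CPU-bound inference.
    time.sleep(1)
    return f"echo: {prompt}"


async def generate_with_timeout(prompt: str, timeout_s: float = 90.0) -> str:
    loop = asyncio.get_running_loop()
    # Passing None uses asyncio's default ThreadPoolExecutor, as the updated app code does.
    return await asyncio.wait_for(
        loop.run_in_executor(None, blocking_generate, prompt),
        timeout=timeout_s,
    )


if __name__ == "__main__":
    print(asyncio.run(generate_with_timeout("hello", timeout_s=5.0)))
```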
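Assuming the service is run locally (for example with `uvicorn app.app:app --port 8000`; the host, port, and sample question below are illustrative, not taken from the repository), the endpoints touched by this commit can be smoke-tested roughly as follows:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

# /health now reports only database and model readiness (the CPU fields were dropped).
health = requests.get(f"{BASE_URL}/health", timeout=10)
print(health.status_code, health.json())

# /chat takes a JSON body matching the Query model's "question" field.
resp = requests.post(
    f"{BASE_URL}/chat",
    json={"question": "Who can approve works contracts?"},
    timeout=120,  # generation may take up to LLM_TIMEOUT_SECONDS (90 s after this commit)
)
print(resp.json().get("answer"))
```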