Kalpokoch committed on
Commit bbdcb91 · verified · 1 Parent(s): a00c3bb

Update app/app.py

Files changed (1)
  1. app/app.py +151 -176
app/app.py CHANGED
@@ -4,27 +4,25 @@ import asyncio
4
  import logging
5
  import uuid
6
  import re
7
- from concurrent.futures import ThreadPoolExecutor
8
  from fastapi import FastAPI, HTTPException, Request
9
  from pydantic import BaseModel
10
  from llama_cpp import Llama
11
  from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
12
 
13
  # -----------------------------
14
- # ✅ Enhanced Configuration for Maximum CPU Usage
15
  # -----------------------------
16
  DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
17
  CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
18
  MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
19
- LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "120")) # Increased timeout
20
  RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
21
- TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "5"))
22
  TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "2"))
23
 
24
- # ✅ NEW: CPU optimization parameters
25
- CPU_CORES = os.cpu_count() or 4 # Detect available CPU cores
26
- LLM_THREADS = min(CPU_CORES, 4) # Cap LLM threads at 4 for efficiency
27
- EXECUTOR_WORKERS = CPU_CORES # More workers for concurrent requests
28
 
29
  # -----------------------------
30
  # ✅ Logging Configuration
@@ -38,9 +36,12 @@ class RequestIdAdapter(logging.LoggerAdapter):
38
  logger = logging.getLogger("app")
39
 
40
  # -----------------------------
41
- # ✅ Initialize FastAPI App
42
  # -----------------------------
43
- app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.3.0")
44
 
45
  @app.middleware("http")
46
  async def add_request_id(request: Request, call_next):
@@ -72,37 +73,29 @@ except Exception as e:
72
  db_ready = False
73
 
74
  # -----------------------------
75
- # ✅ Optimized GGUF Model Loading
76
  # -----------------------------
77
- logger.info(f"Loading GGUF model with {LLM_THREADS} threads from: {MODEL_PATH}")
78
  try:
79
  llm = Llama(
80
  model_path=MODEL_PATH,
81
- n_ctx=4096, # Increased context size
82
- n_threads=LLM_THREADS, # ✅ Use all available CPU cores
83
- n_batch=512, # Increased batch size for better throughput
84
- use_mlock=False, # Disable memory locking for flexibility
85
  use_mmap=True, # Enable memory mapping for efficiency
86
  verbose=False,
87
  n_gpu_layers=0, # CPU only
88
  f16_kv=True, # Use 16-bit for key-value cache to save memory
 
89
  )
90
- logger.info(f"GGUF model loaded successfully with {LLM_THREADS} threads.")
91
  model_ready = True
92
  except Exception as e:
93
  logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
94
  llm = None
95
  model_ready = False
96
 
97
- # -----------------------------
98
- # ✅ Dedicated Thread Pool for LLM Inference
99
- # -----------------------------
100
- llm_executor = ThreadPoolExecutor(
101
- max_workers=EXECUTOR_WORKERS,
102
- thread_name_prefix="LLM-Worker"
103
- )
104
- logger.info(f"Created LLM thread pool with {EXECUTOR_WORKERS} workers")
105
-
106
  # -----------------------------
107
  # ✅ API Schemas
108
  # -----------------------------
@@ -118,7 +111,7 @@ class Feedback(BaseModel):
118
  comment: str | None = None
119
 
120
  # -----------------------------
121
- # ✅ Enhanced Query Processing Functions
122
  # -----------------------------
123
  def classify_query_type(question: str) -> str:
124
  """Classify the type of query to choose appropriate search strategy."""
@@ -217,17 +210,17 @@ Your task is to answer the user's question based ONLY on the provided context.
217
  return prompt
218
 
219
  # -----------------------------
220
- # ✅ Optimized LLM Response Generation
221
  # -----------------------------
222
  def generate_llm_response_sync(prompt: str, request_id: str) -> str:
223
- """Synchronous LLM generation for thread pool execution."""
224
  try:
225
- # ✅ Optimized parameters for better CPU utilization
226
  response = llm(
227
  prompt,
228
- max_tokens=2048, # Increased token limit
229
  stop=["###", "Question:", "Context:", "</s>"],
230
- temperature=0.1, # Lower temperature for more consistent responses
231
  top_p=0.9,
232
  repeat_penalty=1.1,
233
  echo=False
@@ -242,24 +235,8 @@ def generate_llm_response_sync(prompt: str, request_id: str) -> str:
242
  logger.error(f"LLM generation error for request {request_id}: {e}")
243
  raise
244
 
245
- async def generate_llm_response(prompt: str, request_id: str) -> str:
246
- """Async wrapper for LLM generation using dedicated thread pool."""
247
- loop = asyncio.get_running_loop()
248
- try:
249
- # ✅ Use dedicated thread pool for better CPU utilization
250
- response = await loop.run_in_executor(
251
- llm_executor,
252
- generate_llm_response_sync,
253
- prompt,
254
- request_id
255
- )
256
- return response
257
- except Exception as e:
258
- logger.error(f"Async LLM generation error: {e}")
259
- raise
260
-
261
  # -----------------------------
262
- # ✅ Endpoints
263
  # -----------------------------
264
  def get_logger_adapter(request: Request):
265
  return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
@@ -267,10 +244,10 @@ def get_logger_adapter(request: Request):
267
  @app.get("/")
268
  async def root():
269
  return {
270
- "status": "✅ Server is running.",
271
- "cpu_cores": CPU_CORES,
272
- "llm_threads": LLM_THREADS,
273
- "executor_workers": EXECUTOR_WORKERS
274
  }
275
 
276
  @app.get("/health")
@@ -279,11 +256,8 @@ async def health_check():
279
  "status": "ok",
280
  "database_status": "ready" if db_ready else "error",
281
  "model_status": "ready" if model_ready else "error",
282
- "cpu_optimization": {
283
- "cpu_cores": CPU_CORES,
284
- "llm_threads": LLM_THREADS,
285
- "executor_workers": EXECUTOR_WORKERS
286
- }
287
  }
288
  if not db_ready or not model_ready:
289
  raise HTTPException(status_code=503, detail=status)
@@ -291,128 +265,130 @@ async def health_check():
291
 
292
  @app.post("/chat")
293
  async def chat(query: Query, request: Request):
294
- adapter = get_logger_adapter(request)
295
- question_lower = query.question.strip().lower()
296
-
297
- # Greeting handling
298
- greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
299
- if question_lower in greeting_keywords:
300
- adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
301
- intro_message = (
302
- "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
303
- "I can help you find accurate information about approval authorities, monetary limits, procedures, and policy requirements. "
304
- "How can I assist you with the DoP policy today?"
305
- )
306
- return {
307
- "request_id": getattr(request.state, 'request_id', 'N/A'),
308
- "question": query.question,
309
- "context_used": "NA - Greeting",
310
- "answer": intro_message
311
- }
312
-
313
- if not db_ready or not model_ready:
314
- adapter.error("Service unavailable due to initialization failure.")
315
- raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
316
-
317
- adapter.info(f"Received query: '{query.question}'")
318
-
319
- # Query classification and search
320
- query_type = classify_query_type(query.question)
321
- adapter.info(f"Query classified as: {query_type}")
322
-
323
- search_results = []
324
-
325
- if query_type == "monetary":
326
- amount = extract_monetary_amount(query.question)
327
- if amount:
328
- adapter.info(f"Extracted monetary amount: ₹{amount}")
329
- monetary_results = db.search_by_amount(amount, comparison=">=", top_k=TOP_K_SEARCH)
330
- if monetary_results:
331
- search_results = monetary_results
332
- adapter.info(f"Found {len(search_results)} results using monetary search")
333
-
334
- if not search_results:
335
- search_results = db.search_with_context(
336
- query.question,
337
- top_k=TOP_K_SEARCH,
338
- include_related=True
339
- )
340
- adapter.info(f"Found {len(search_results)} results using semantic search with context")
341
-
342
- if not search_results:
343
- adapter.warning("No relevant context found in vector DB.")
344
- return {
345
- "request_id": getattr(request.state, 'request_id', 'N/A'),
346
- "question": query.question,
347
- "context_used": "No relevant context found.",
348
- "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing or ask about specific delegation limits, approval authorities, or procedures."
349
- }
350
-
351
- # Log search results with metadata
352
- result_info = []
353
- for i, result in enumerate(search_results):
354
- metadata = result.get('metadata', {})
355
- role = metadata.get('role', 'N/A')
356
- section = metadata.get('section', 'N/A')
357
- score = result.get('relevance_score', 0)
358
- result_info.append(f"#{i+1}: Score={score:.3f}, Role={role}, Section={section}")
359
-
360
- adapter.info(f"Search results: {' | '.join(result_info)}")
361
-
362
- # Prepare context with metadata
363
- context_chunks = []
364
- for result in search_results[:TOP_K_CONTEXT]:
365
- chunk_text = result['text']
366
- metadata = result.get('metadata', {})
367
 
368
- if metadata.get('section') or metadata.get('role'):
369
- metadata_prefix = f"[Section: {metadata.get('section', 'N/A')}, Role: {metadata.get('role', 'N/A')}] "
370
- chunk_text = metadata_prefix + chunk_text
371
 
372
- context_chunks.append(chunk_text)
373
-
374
- context = "\n---\n".join(context_chunks)
375
- prompt = build_enhanced_prompt(query.question, context, query_type, search_results)
376
 
377
- # Generate response
378
- answer = "An error occurred while processing your request."
379
- try:
380
- adapter.info(f"Sending enhanced prompt to LLM for {query_type} query...")
381
- raw_answer = await asyncio.wait_for(
382
- generate_llm_response(prompt, request.state.request_id),
383
- timeout=LLM_TIMEOUT_SECONDS
384
- )
385
- adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
386
 
387
- # Post-processing
388
- if '|' in raw_answer:
389
- adapter.info("Pipe separator found. Formatting response as a bulleted list.")
390
- items = raw_answer.split('|')
391
- cleaned_items = [f"• {item.strip()}" for item in items if item.strip()]
392
- answer = "\n".join(cleaned_items)
393
- else:
394
- answer = raw_answer.strip()
395
 
396
- if query_type == "monetary" and "₹" not in answer and extract_monetary_amount(query.question):
397
- amount = extract_monetary_amount(query.question)
398
- answer = f"For amounts of ₹{amount:,.0f}:\n\n{answer}"
399
 
400
- except asyncio.TimeoutError:
401
- adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
402
- answer = "Sorry, the request took too long to process. Please try again with a simpler question."
403
- except Exception as e:
404
- adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
405
- answer = "Sorry, an unexpected error occurred while generating a response."
406
 
407
- adapter.info(f"Final answer prepared for {query_type} query. Returning to client.")
408
- return {
409
- "request_id": request.state.request_id,
410
- "question": query.question,
411
- "context_used": context,
412
- "answer": answer,
413
- "query_type": query_type,
414
- "search_strategy": "monetary" if query_type == "monetary" and extract_monetary_amount(query.question) else "semantic_with_context"
415
- }
 
416
 
417
  @app.post("/feedback")
418
  async def collect_feedback(feedback: Feedback, request: Request):
@@ -429,8 +405,7 @@ async def collect_feedback(feedback: Feedback, request: Request):
429
  adapter.info(json.dumps(feedback_log))
430
  return {"status": "✅ Feedback recorded. Thank you!"}
431
 
432
- # ✅ Cleanup on shutdown
433
  @app.on_event("shutdown")
434
  async def shutdown_event():
435
- llm_executor.shutdown(wait=True)
436
- logger.info("Thread pool executor shut down successfully.")
 
4
  import logging
5
  import uuid
6
  import re
 
7
  from fastapi import FastAPI, HTTPException, Request
8
  from pydantic import BaseModel
9
  from llama_cpp import Llama
10
  from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
11
 
12
  # -----------------------------
13
+ # ✅ Optimized Configuration for Hugging Face Free Tier
14
  # -----------------------------
15
  DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
16
  CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
17
  MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
18
+ LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "60")) # Reduced timeout for free tier
19
  RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
20
+ TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3")) # Reduced for efficiency
21
  TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "2"))
22
 
23
+ # ✅ Single-threaded CPU optimization
24
+ LLM_THREADS = 1 # Single thread for free tier
25
+ MAX_CONCURRENT_REQUESTS = 1 # Process one request at a time
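Illustrative note (not part of this commit): every setting above is read from an environment variable, so the free-tier defaults can be tuned without code changes. A minimal local override might look like the sketch below; the uvicorn launch and the app.app:app module path are assumptions, and 7860 is simply the port Hugging Face Spaces usually exposes.

import os
import uvicorn

# Hypothetical local tuning; the variable names match the os.getenv() calls above.
os.environ["LLM_TIMEOUT_SECONDS"] = "90"
os.environ["TOP_K_SEARCH"] = "5"

# The overrides must be set before the app module is imported, which is why the
# string import path (loaded lazily by uvicorn) is used here.
uvicorn.run("app.app:app", host="0.0.0.0", port=7860)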
 
26
 
27
  # -----------------------------
28
  # ✅ Logging Configuration
 
36
  logger = logging.getLogger("app")
37
 
38
  # -----------------------------
39
+ # ✅ Initialize FastAPI App with Request Limiting
40
  # -----------------------------
41
+ app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.4.0")
42
+
43
+ # ✅ Request queue to ensure single processing
44
+ request_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
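A note on the semaphore above (illustrative sketch, not part of this commit): async with request_semaphore makes every /chat call queue behind the single request in flight. If queued callers should fail fast rather than wait indefinitely, one option is a bounded acquire; the 5-second wait below is an arbitrary example value.

try:
    # Wait briefly for a processing slot instead of queueing forever.
    await asyncio.wait_for(request_semaphore.acquire(), timeout=5)
except asyncio.TimeoutError:
    raise HTTPException(status_code=429, detail="Server is busy, please retry shortly.")
try:
    ...  # existing search and generation logic
finally:
    request_semaphore.release()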
45
 
46
  @app.middleware("http")
47
  async def add_request_id(request: Request, call_next):
 
73
  db_ready = False
74
 
75
  # -----------------------------
76
+ # ✅ Memory-Optimized GGUF Model Loading for Free Tier
77
  # -----------------------------
78
+ logger.info(f"Loading GGUF model for single-threaded processing from: {MODEL_PATH}")
79
  try:
80
  llm = Llama(
81
  model_path=MODEL_PATH,
82
+ n_ctx=2048, # Reduced context size for memory efficiency
83
+ n_threads=LLM_THREADS, # Single thread
84
+ n_batch=256, # Smaller batch size for memory efficiency
85
+ use_mlock=False, # Disable memory locking
86
  use_mmap=True, # Enable memory mapping for efficiency
87
  verbose=False,
88
  n_gpu_layers=0, # CPU only
89
  f16_kv=True, # Use 16-bit for key-value cache to save memory
90
+ low_vram=True, # Enable low VRAM mode for better memory usage
91
  )
92
+ logger.info("GGUF model loaded successfully for single-threaded processing.")
93
  model_ready = True
94
  except Exception as e:
95
  logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
96
  llm = None
97
  model_ready = False
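Illustrative sketch (not part of this commit): a one-off warm-up generation right after loading can surface model problems before the first user request and pre-touches the memory-mapped weights; it uses the same Llama call style as the rest of this file.

if model_ready:
    try:
        _ = llm("Warm-up.", max_tokens=8, temperature=0.0)  # tiny throwaway completion
        logger.info("Warm-up generation completed.")
    except Exception as exc:  # keep startup alive even if the warm-up fails
        logger.warning(f"Warm-up generation failed: {exc}")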
98
99
  # -----------------------------
100
  # ✅ API Schemas
101
  # -----------------------------
 
111
  comment: str | None = None
112
 
113
  # -----------------------------
114
+ # ✅ Query Processing Functions (Unchanged)
115
  # -----------------------------
116
  def classify_query_type(question: str) -> str:
117
  """Classify the type of query to choose appropriate search strategy."""
 
210
  return prompt
211
 
212
  # -----------------------------
213
+ # ✅ Synchronous LLM Response Generation (No Threading)
214
  # -----------------------------
215
  def generate_llm_response_sync(prompt: str, request_id: str) -> str:
216
+ """Synchronous LLM generation optimized for single-threaded processing."""
217
  try:
218
+ # ✅ Optimized parameters for free tier CPU
219
  response = llm(
220
  prompt,
221
+ max_tokens=1024, # Reduced token limit for faster processing
222
  stop=["###", "Question:", "Context:", "</s>"],
223
+ temperature=0.1, # Lower temperature for consistent responses
224
  top_p=0.9,
225
  repeat_penalty=1.1,
226
  echo=False
 
235
  logger.error(f"LLM generation error for request {request_id}: {e}")
236
  raise
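Illustrative sketch (not part of this commit): with the thread pool removed, generate_llm_response_sync is called directly from the async endpoint below, which blocks the event loop for the whole generation and leaves LLM_TIMEOUT_SECONDS unenforced. If the timeout is still wanted without reintroducing an executor, one option on Python 3.9+ is:

raw_answer = await asyncio.wait_for(
    asyncio.to_thread(generate_llm_response_sync, prompt, request.state.request_id),
    timeout=LLM_TIMEOUT_SECONDS,
)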
237
238
  # -----------------------------
239
+ # ✅ Endpoints with Request Limiting
240
  # -----------------------------
241
  def get_logger_adapter(request: Request):
242
  return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
 
244
  @app.get("/")
245
  async def root():
246
  return {
247
+ "status": "✅ Server is running on Hugging Face Free Tier",
248
+ "mode": "Single-threaded processing",
249
+ "max_concurrent_requests": MAX_CONCURRENT_REQUESTS,
250
+ "llm_threads": LLM_THREADS
251
  }
252
 
253
  @app.get("/health")
 
256
  "status": "ok",
257
  "database_status": "ready" if db_ready else "error",
258
  "model_status": "ready" if model_ready else "error",
259
+ "processing_mode": "single_threaded",
260
+ "max_concurrent_requests": MAX_CONCURRENT_REQUESTS
261
  }
262
  if not db_ready or not model_ready:
263
  raise HTTPException(status_code=503, detail=status)
 
265
 
266
  @app.post("/chat")
267
  async def chat(query: Query, request: Request):
268
+ # ✅ Acquire semaphore to ensure single request processing
269
+ async with request_semaphore:
270
+ adapter = get_logger_adapter(request)
271
+ adapter.info("Processing request (single-threaded mode)")
272
 
273
+ question_lower = query.question.strip().lower()
274
+
275
+ # Greeting handling
276
+ greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
277
+ if question_lower in greeting_keywords:
278
+ adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
279
+ intro_message = (
280
+ "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
281
+ "I can help you find accurate information about approval authorities, monetary limits, procedures, and policy requirements. "
282
+ "How can I assist you with the DoP policy today?"
283
+ )
284
+ return {
285
+ "request_id": getattr(request.state, 'request_id', 'N/A'),
286
+ "question": query.question,
287
+ "context_used": "NA - Greeting",
288
+ "answer": intro_message
289
+ }
290
+
291
+ if not db_ready or not model_ready:
292
+ adapter.error("Service unavailable due to initialization failure.")
293
+ raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
294
+
295
+ adapter.info(f"Received query: '{query.question}'")
296
+
297
+ # Query classification and search
298
+ query_type = classify_query_type(query.question)
299
+ adapter.info(f"Query classified as: {query_type}")
300
+
301
+ search_results = []
302
 
303
+ if query_type == "monetary":
304
+ amount = extract_monetary_amount(query.question)
305
+ if amount:
306
+ adapter.info(f"Extracted monetary amount: ₹{amount}")
307
+ monetary_results = db.search_by_amount(amount, comparison=">=", top_k=TOP_K_SEARCH)
308
+ if monetary_results:
309
+ search_results = monetary_results
310
+ adapter.info(f"Found {len(search_results)} results using monetary search")
311
+
312
+ if not search_results:
313
+ search_results = db.search_with_context(
314
+ query.question,
315
+ top_k=TOP_K_SEARCH,
316
+ include_related=True
317
+ )
318
+ adapter.info(f"Found {len(search_results)} results using semantic search with context")
319
+
320
+ if not search_results:
321
+ adapter.warning("No relevant context found in vector DB.")
322
+ return {
323
+ "request_id": getattr(request.state, 'request_id', 'N/A'),
324
+ "question": query.question,
325
+ "context_used": "No relevant context found.",
326
+ "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing or ask about specific delegation limits, approval authorities, or procedures."
327
+ }
328
+
329
+ # Log search results with metadata
330
+ result_info = []
331
+ for i, result in enumerate(search_results):
332
+ metadata = result.get('metadata', {})
333
+ role = metadata.get('role', 'N/A')
334
+ section = metadata.get('section', 'N/A')
335
+ score = result.get('relevance_score', 0)
336
+ result_info.append(f"#{i+1}: Score={score:.3f}, Role={role}, Section={section}")
337
+
338
+ adapter.info(f"Search results: {' | '.join(result_info)}")
339
 
340
+ # Prepare context with metadata
341
+ context_chunks = []
342
+ for result in search_results[:TOP_K_CONTEXT]:
343
+ chunk_text = result['text']
344
+ metadata = result.get('metadata', {})
345
+
346
+ if metadata.get('section') or metadata.get('role'):
347
+ metadata_prefix = f"[Section: {metadata.get('section', 'N/A')}, Role: {metadata.get('role', 'N/A')}] "
348
+ chunk_text = metadata_prefix + chunk_text
349
+
350
+ context_chunks.append(chunk_text)
351
 
352
+ context = "\n---\n".join(context_chunks)
353
+ prompt = build_enhanced_prompt(query.question, context, query_type, search_results)
354
+
355
+ # Generate response synchronously
356
+ answer = "An error occurred while processing your request."
357
+ try:
358
+ adapter.info(f"Sending prompt to LLM for {query_type} query (synchronous processing)...")
 
359
 
360
+ # ✅ Direct synchronous call - no threading or async execution
361
+ raw_answer = generate_llm_response_sync(prompt, request.state.request_id)
362
+
363
+ adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
364
+
365
+ # Post-processing
366
+ if '|' in raw_answer:
367
+ adapter.info("Pipe separator found. Formatting response as a bulleted list.")
368
+ items = raw_answer.split('|')
369
+ cleaned_items = [f"• {item.strip()}" for item in items if item.strip()]
370
+ answer = "\n".join(cleaned_items)
371
+ else:
372
+ answer = raw_answer.strip()
373
+
374
+ if query_type == "monetary" and "₹" not in answer and extract_monetary_amount(query.question):
375
+ amount = extract_monetary_amount(query.question)
376
+ answer = f"For amounts of ₹{amount:,.0f}:\n\n{answer}"
377
 
378
+ except Exception as e:
379
+ adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
380
+ answer = "Sorry, an unexpected error occurred while generating a response."
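For reference (illustrative, with made-up placeholder text, not part of this commit), the pipe handling above reshapes a flat model answer into a bulleted list:

raw_answer = "Officer A may approve | Officer B must countersign | Board approval is required beyond the limit"
# After the split/strip above, `answer` becomes:
# • Officer A may approve
# • Officer B must countersign
# • Board approval is required beyond the limit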
381
 
382
+ adapter.info(f"Final answer prepared for {query_type} query. Returning to client.")
383
+ return {
384
+ "request_id": request.state.request_id,
385
+ "question": query.question,
386
+ "context_used": context,
387
+ "answer": answer,
388
+ "query_type": query_type,
389
+ "search_strategy": "monetary" if query_type == "monetary" and extract_monetary_amount(query.question) else "semantic_with_context",
390
+ "processing_mode": "single_threaded"
391
+ }
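Usage sketch (illustrative, not part of this commit; the host and port are assumptions, 7860 being the port Hugging Face Spaces usually exposes): the endpoint accepts a JSON body matching the Query schema and returns the fields assembled above.

import requests

resp = requests.post(
    "http://localhost:7860/chat",
    json={"question": "What are the delegation limits for procurement?"},
    timeout=120,  # single-threaded CPU generation can be slow
)
print(resp.json()["answer"])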
392
 
393
  @app.post("/feedback")
394
  async def collect_feedback(feedback: Feedback, request: Request):
 
405
  adapter.info(json.dumps(feedback_log))
406
  return {"status": "✅ Feedback recorded. Thank you!"}
407
 
408
+ # ✅ No cleanup needed for single-threaded processing
409
  @app.on_event("shutdown")
410
  async def shutdown_event():
411
+ logger.info("Application shutting down (single-threaded mode).")