Kalpokoch committed on
Commit bbdcb91 · verified · 1 Parent(s): a00c3bb

Update app/app.py

Files changed (1)
  1. app/app.py +151 -176
app/app.py CHANGED
@@ -4,27 +4,25 @@ import asyncio
4
  import logging
5
  import uuid
6
  import re
7
- from concurrent.futures import ThreadPoolExecutor
8
  from fastapi import FastAPI, HTTPException, Request
9
  from pydantic import BaseModel
10
  from llama_cpp import Llama
11
  from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
12
 
13
  # -----------------------------
14
- # ✅ Enhanced Configuration for Maximum CPU Usage
15
  # -----------------------------
16
  DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
17
  CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
18
  MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
19
- LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "120")) # Increased timeout
20
  RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
21
- TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "5"))
22
  TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "2"))
23
 
24
- # ✅ NEW: CPU optimization parameters
25
- CPU_CORES = os.cpu_count() or 4 # Detect available CPU cores
26
- LLM_THREADS = min(CPU_CORES, 4) # Cap LLM threads at 4 for efficiency
27
- EXECUTOR_WORKERS = CPU_CORES # More workers for concurrent requests
28
 
29
  # -----------------------------
30
  # ✅ Logging Configuration
@@ -38,9 +36,12 @@ class RequestIdAdapter(logging.LoggerAdapter):
38
  logger = logging.getLogger("app")
39
 
40
  # -----------------------------
41
- # ✅ Initialize FastAPI App
42
  # -----------------------------
43
- app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.3.0")
44
 
45
  @app.middleware("http")
46
  async def add_request_id(request: Request, call_next):
@@ -72,37 +73,29 @@ except Exception as e:
72
  db_ready = False
73
 
74
  # -----------------------------
75
- # ✅ Optimized GGUF Model Loading
76
  # -----------------------------
77
- logger.info(f"Loading GGUF model with {LLM_THREADS} threads from: {MODEL_PATH}")
78
  try:
79
  llm = Llama(
80
  model_path=MODEL_PATH,
81
- n_ctx=4096, # Increased context size
82
- n_threads=LLM_THREADS, # ✅ Use all available CPU cores
83
- n_batch=512, # Increased batch size for better throughput
84
- use_mlock=False, # Disable memory locking for flexibility
85
  use_mmap=True, # Enable memory mapping for efficiency
86
  verbose=False,
87
  n_gpu_layers=0, # CPU only
88
  f16_kv=True, # Use 16-bit for key-value cache to save memory
 
89
  )
90
- logger.info(f"GGUF model loaded successfully with {LLM_THREADS} threads.")
91
  model_ready = True
92
  except Exception as e:
93
  logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
94
  llm = None
95
  model_ready = False
96
 
97
- # -----------------------------
98
- # ✅ Dedicated Thread Pool for LLM Inference
99
- # -----------------------------
100
- llm_executor = ThreadPoolExecutor(
101
- max_workers=EXECUTOR_WORKERS,
102
- thread_name_prefix="LLM-Worker"
103
- )
104
- logger.info(f"Created LLM thread pool with {EXECUTOR_WORKERS} workers")
105
-
106
  # -----------------------------
107
  # ✅ API Schemas
108
  # -----------------------------
@@ -118,7 +111,7 @@ class Feedback(BaseModel):
118
  comment: str | None = None
119
 
120
  # -----------------------------
121
- # ✅ Enhanced Query Processing Functions
122
  # -----------------------------
123
  def classify_query_type(question: str) -> str:
124
  """Classify the type of query to choose appropriate search strategy."""
@@ -217,17 +210,17 @@ Your task is to answer the user's question based ONLY on the provided context.
217
  return prompt
218
 
219
  # -----------------------------
220
- # ✅ Optimized LLM Response Generation
221
  # -----------------------------
222
  def generate_llm_response_sync(prompt: str, request_id: str) -> str:
223
- """Synchronous LLM generation for thread pool execution."""
224
  try:
225
- # ✅ Optimized parameters for better CPU utilization
226
  response = llm(
227
  prompt,
228
- max_tokens=2048, # Increased token limit
229
  stop=["###", "Question:", "Context:", "</s>"],
230
- temperature=0.1, # Lower temperature for more consistent responses
231
  top_p=0.9,
232
  repeat_penalty=1.1,
233
  echo=False
@@ -242,24 +235,8 @@ def generate_llm_response_sync(prompt: str, request_id: str) -> str:
242
  logger.error(f"LLM generation error for request {request_id}: {e}")
243
  raise
244
 
245
- async def generate_llm_response(prompt: str, request_id: str) -> str:
246
- """Async wrapper for LLM generation using dedicated thread pool."""
247
- loop = asyncio.get_running_loop()
248
- try:
249
- # ✅ Use dedicated thread pool for better CPU utilization
250
- response = await loop.run_in_executor(
251
- llm_executor,
252
- generate_llm_response_sync,
253
- prompt,
254
- request_id
255
- )
256
- return response
257
- except Exception as e:
258
- logger.error(f"Async LLM generation error: {e}")
259
- raise
260
-
261
  # -----------------------------
262
- # ✅ Endpoints
263
  # -----------------------------
264
  def get_logger_adapter(request: Request):
265
  return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
@@ -267,10 +244,10 @@ def get_logger_adapter(request: Request):
267
  @app.get("/")
268
  async def root():
269
  return {
270
- "status": "✅ Server is running.",
271
- "cpu_cores": CPU_CORES,
272
- "llm_threads": LLM_THREADS,
273
- "executor_workers": EXECUTOR_WORKERS
274
  }
275
 
276
  @app.get("/health")
@@ -279,11 +256,8 @@ async def health_check():
279
  "status": "ok",
280
  "database_status": "ready" if db_ready else "error",
281
  "model_status": "ready" if model_ready else "error",
282
- "cpu_optimization": {
283
- "cpu_cores": CPU_CORES,
284
- "llm_threads": LLM_THREADS,
285
- "executor_workers": EXECUTOR_WORKERS
286
- }
287
  }
288
  if not db_ready or not model_ready:
289
  raise HTTPException(status_code=503, detail=status)
@@ -291,128 +265,130 @@ async def health_check():
291
 
292
  @app.post("/chat")
293
  async def chat(query: Query, request: Request):
294
- adapter = get_logger_adapter(request)
295
- question_lower = query.question.strip().lower()
296
-
297
- # Greeting handling
298
- greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
299
- if question_lower in greeting_keywords:
300
- adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
301
- intro_message = (
302
- "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
303
- "I can help you find accurate information about approval authorities, monetary limits, procedures, and policy requirements. "
304
- "How can I assist you with the DoP policy today?"
305
- )
306
- return {
307
- "request_id": getattr(request.state, 'request_id', 'N/A'),
308
- "question": query.question,
309
- "context_used": "NA - Greeting",
310
- "answer": intro_message
311
- }
312
-
313
- if not db_ready or not model_ready:
314
- adapter.error("Service unavailable due to initialization failure.")
315
- raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
316
-
317
- adapter.info(f"Received query: '{query.question}'")
318
-
319
- # Query classification and search
320
- query_type = classify_query_type(query.question)
321
- adapter.info(f"Query classified as: {query_type}")
322
-
323
- search_results = []
324
-
325
- if query_type == "monetary":
326
- amount = extract_monetary_amount(query.question)
327
- if amount:
328
- adapter.info(f"Extracted monetary amount: ₹{amount}")
329
- monetary_results = db.search_by_amount(amount, comparison=">=", top_k=TOP_K_SEARCH)
330
- if monetary_results:
331
- search_results = monetary_results
332
- adapter.info(f"Found {len(search_results)} results using monetary search")
333
-
334
- if not search_results:
335
- search_results = db.search_with_context(
336
- query.question,
337
- top_k=TOP_K_SEARCH,
338
- include_related=True
339
- )
340
- adapter.info(f"Found {len(search_results)} results using semantic search with context")
341
-
342
- if not search_results:
343
- adapter.warning("No relevant context found in vector DB.")
344
- return {
345
- "request_id": getattr(request.state, 'request_id', 'N/A'),
346
- "question": query.question,
347
- "context_used": "No relevant context found.",
348
- "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing or ask about specific delegation limits, approval authorities, or procedures."
349
- }
350
-
351
- # Log search results with metadata
352
- result_info = []
353
- for i, result in enumerate(search_results):
354
- metadata = result.get('metadata', {})
355
- role = metadata.get('role', 'N/A')
356
- section = metadata.get('section', 'N/A')
357
- score = result.get('relevance_score', 0)
358
- result_info.append(f"#{i+1}: Score={score:.3f}, Role={role}, Section={section}")
359
-
360
- adapter.info(f"Search results: {' | '.join(result_info)}")
361
-
362
- # Prepare context with metadata
363
- context_chunks = []
364
- for result in search_results[:TOP_K_CONTEXT]:
365
- chunk_text = result['text']
366
- metadata = result.get('metadata', {})
367
 
368
- if metadata.get('section') or metadata.get('role'):
369
- metadata_prefix = f"[Section: {metadata.get('section', 'N/A')}, Role: {metadata.get('role', 'N/A')}] "
370
- chunk_text = metadata_prefix + chunk_text
371
 
372
- context_chunks.append(chunk_text)
373
-
374
- context = "\n---\n".join(context_chunks)
375
- prompt = build_enhanced_prompt(query.question, context, query_type, search_results)
376
 
377
- # Generate response
378
- answer = "An error occurred while processing your request."
379
- try:
380
- adapter.info(f"Sending enhanced prompt to LLM for {query_type} query...")
381
- raw_answer = await asyncio.wait_for(
382
- generate_llm_response(prompt, request.state.request_id),
383
- timeout=LLM_TIMEOUT_SECONDS
384
- )
385
- adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
386
 
387
- # Post-processing
388
- if '|' in raw_answer:
389
- adapter.info("Pipe separator found. Formatting response as a bulleted list.")
390
- items = raw_answer.split('|')
391
- cleaned_items = [f"• {item.strip()}" for item in items if item.strip()]
392
- answer = "\n".join(cleaned_items)
393
- else:
394
- answer = raw_answer.strip()
395
 
396
- if query_type == "monetary" and "₹" not in answer and extract_monetary_amount(query.question):
397
- amount = extract_monetary_amount(query.question)
398
- answer = f"For amounts of ₹{amount:,.0f}:\n\n{answer}"
399
 
400
- except asyncio.TimeoutError:
401
- adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
402
- answer = "Sorry, the request took too long to process. Please try again with a simpler question."
403
- except Exception as e:
404
- adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
405
- answer = "Sorry, an unexpected error occurred while generating a response."
406
 
407
- adapter.info(f"Final answer prepared for {query_type} query. Returning to client.")
408
- return {
409
- "request_id": request.state.request_id,
410
- "question": query.question,
411
- "context_used": context,
412
- "answer": answer,
413
- "query_type": query_type,
414
- "search_strategy": "monetary" if query_type == "monetary" and extract_monetary_amount(query.question) else "semantic_with_context"
415
- }
 
416
 
417
  @app.post("/feedback")
418
  async def collect_feedback(feedback: Feedback, request: Request):
@@ -429,8 +405,7 @@ async def collect_feedback(feedback: Feedback, request: Request):
429
  adapter.info(json.dumps(feedback_log))
430
  return {"status": "✅ Feedback recorded. Thank you!"}
431
 
432
- # ✅ Cleanup on shutdown
433
  @app.on_event("shutdown")
434
  async def shutdown_event():
435
- llm_executor.shutdown(wait=True)
436
- logger.info("Thread pool executor shut down successfully.")
 
4
  import logging
5
  import uuid
6
  import re
 
7
  from fastapi import FastAPI, HTTPException, Request
8
  from pydantic import BaseModel
9
  from llama_cpp import Llama
10
  from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
11
 
12
  # -----------------------------
13
+ # ✅ Optimized Configuration for Hugging Face Free Tier
14
  # -----------------------------
15
  DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
16
  CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
17
  MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
18
+ LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "60")) # Reduced timeout for free tier
19
  RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
20
+ TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3")) # Reduced for efficiency
21
  TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "2"))
22
 
23
+ # ✅ Single-threaded CPU optimization
24
+ LLM_THREADS = 1 # Single thread for free tier
25
+ MAX_CONCURRENT_REQUESTS = 1 # Process one request at a time
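Illustrative note (not part of this commit): every setting above is read from an environment variable, so the free-tier defaults can be tuned without code changes. A minimal local override might look like the sketch below; the uvicorn launch and the app.app:app module path are assumptions, and 7860 is simply the port Hugging Face Spaces usually exposes.

import os
import uvicorn

# Hypothetical local tuning; the variable names match the os.getenv() calls above.
os.environ["LLM_TIMEOUT_SECONDS"] = "90"
os.environ["TOP_K_SEARCH"] = "5"

# The overrides must be set before the app module is imported, which is why the
# string import path (loaded lazily by uvicorn) is used here.
uvicorn.run("app.app:app", host="0.0.0.0", port=7860)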
 
26
 
27
  # -----------------------------
28
  # ✅ Logging Configuration
 
36
  logger = logging.getLogger("app")
37
 
38
  # -----------------------------
39
+ # ✅ Initialize FastAPI App with Request Limiting
40
  # -----------------------------
41
+ app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.4.0")
42
+
43
+ # ✅ Request queue to ensure single processing
44
+ request_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
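A note on the semaphore above (illustrative sketch, not part of this commit): async with request_semaphore makes every /chat call queue behind the single request in flight. If queued callers should fail fast rather than wait indefinitely, one option is a bounded acquire; the 5-second wait below is an arbitrary example value.

try:
    # Wait briefly for a processing slot instead of queueing forever.
    await asyncio.wait_for(request_semaphore.acquire(), timeout=5)
except asyncio.TimeoutError:
    raise HTTPException(status_code=429, detail="Server is busy, please retry shortly.")
try:
    ...  # existing search and generation logic
finally:
    request_semaphore.release()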
45
 
46
  @app.middleware("http")
47
  async def add_request_id(request: Request, call_next):
 
73
  db_ready = False
74
 
75
  # -----------------------------
76
+ # ✅ Memory-Optimized GGUF Model Loading for Free Tier
77
  # -----------------------------
78
+ logger.info(f"Loading GGUF model for single-threaded processing from: {MODEL_PATH}")
79
  try:
80
  llm = Llama(
81
  model_path=MODEL_PATH,
82
+ n_ctx=2048, # Reduced context size for memory efficiency
83
+ n_threads=LLM_THREADS, # Single thread
84
+ n_batch=256, # Smaller batch size for memory efficiency
85
+ use_mlock=False, # Disable memory locking
86
  use_mmap=True, # Enable memory mapping for efficiency
87
  verbose=False,
88
  n_gpu_layers=0, # CPU only
89
  f16_kv=True, # Use 16-bit for key-value cache to save memory
90
+ low_vram=True, # Enable low VRAM mode for better memory usage
91
  )
92
+ logger.info("GGUF model loaded successfully for single-threaded processing.")
93
  model_ready = True
94
  except Exception as e:
95
  logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
96
  llm = None
97
  model_ready = False
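Illustrative sketch (not part of this commit): a one-off warm-up generation right after loading can surface model problems before the first user request and pre-touches the memory-mapped weights; it uses the same Llama call style as the rest of this file.

if model_ready:
    try:
        _ = llm("Warm-up.", max_tokens=8, temperature=0.0)  # tiny throwaway completion
        logger.info("Warm-up generation completed.")
    except Exception as exc:  # keep startup alive even if the warm-up fails
        logger.warning(f"Warm-up generation failed: {exc}")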
98
99
  # -----------------------------
100
  # ✅ API Schemas
101
  # -----------------------------
 
111
  comment: str | None = None
112
 
113
  # -----------------------------
114
+ # ✅ Query Processing Functions (Unchanged)
115
  # -----------------------------
116
  def classify_query_type(question: str) -> str:
117
  """Classify the type of query to choose appropriate search strategy."""
 
210
  return prompt
211
 
212
  # -----------------------------
213
+ # ✅ Synchronous LLM Response Generation (No Threading)
214
  # -----------------------------
215
  def generate_llm_response_sync(prompt: str, request_id: str) -> str:
216
+ """Synchronous LLM generation optimized for single-threaded processing."""
217
  try:
218
+ # ✅ Optimized parameters for free tier CPU
219
  response = llm(
220
  prompt,
221
+ max_tokens=1024, # Reduced token limit for faster processing
222
  stop=["###", "Question:", "Context:", "</s>"],
223
+ temperature=0.1, # Lower temperature for consistent responses
224
  top_p=0.9,
225
  repeat_penalty=1.1,
226
  echo=False
 
235
  logger.error(f"LLM generation error for request {request_id}: {e}")
236
  raise
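Illustrative sketch (not part of this commit): with the thread pool removed, generate_llm_response_sync is called directly from the async endpoint below, which blocks the event loop for the whole generation and leaves LLM_TIMEOUT_SECONDS unenforced. If the timeout is still wanted without reintroducing an executor, one option on Python 3.9+ is:

raw_answer = await asyncio.wait_for(
    asyncio.to_thread(generate_llm_response_sync, prompt, request.state.request_id),
    timeout=LLM_TIMEOUT_SECONDS,
)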
237
238
  # -----------------------------
239
+ # ✅ Endpoints with Request Limiting
240
  # -----------------------------
241
  def get_logger_adapter(request: Request):
242
  return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
 
244
  @app.get("/")
245
  async def root():
246
  return {
247
+ "status": "✅ Server is running on Hugging Face Free Tier",
248
+ "mode": "Single-threaded processing",
249
+ "max_concurrent_requests": MAX_CONCURRENT_REQUESTS,
250
+ "llm_threads": LLM_THREADS
251
  }
252
 
253
  @app.get("/health")
 
256
  "status": "ok",
257
  "database_status": "ready" if db_ready else "error",
258
  "model_status": "ready" if model_ready else "error",
259
+ "processing_mode": "single_threaded",
260
+ "max_concurrent_requests": MAX_CONCURRENT_REQUESTS
261
  }
262
  if not db_ready or not model_ready:
263
  raise HTTPException(status_code=503, detail=status)
 
265
 
266
  @app.post("/chat")
267
  async def chat(query: Query, request: Request):
268
+ # ✅ Acquire semaphore to ensure single request processing
269
+ async with request_semaphore:
270
+ adapter = get_logger_adapter(request)
271
+ adapter.info("Processing request (single-threaded mode)")
272
 
273
+ question_lower = query.question.strip().lower()
274
+
275
+ # Greeting handling
276
+ greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
277
+ if question_lower in greeting_keywords:
278
+ adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
279
+ intro_message = (
280
+ "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
281
+ "I can help you find accurate information about approval authorities, monetary limits, procedures, and policy requirements. "
282
+ "How can I assist you with the DoP policy today?"
283
+ )
284
+ return {
285
+ "request_id": getattr(request.state, 'request_id', 'N/A'),
286
+ "question": query.question,
287
+ "context_used": "NA - Greeting",
288
+ "answer": intro_message
289
+ }
290
+
291
+ if not db_ready or not model_ready:
292
+ adapter.error("Service unavailable due to initialization failure.")
293
+ raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
294
+
295
+ adapter.info(f"Received query: '{query.question}'")
296
+
297
+ # Query classification and search
298
+ query_type = classify_query_type(query.question)
299
+ adapter.info(f"Query classified as: {query_type}")
300
+
301
+ search_results = []
302
 
303
+ if query_type == "monetary":
304
+ amount = extract_monetary_amount(query.question)
305
+ if amount:
306
+ adapter.info(f"Extracted monetary amount: ₹{amount}")
307
+ monetary_results = db.search_by_amount(amount, comparison=">=", top_k=TOP_K_SEARCH)
308
+ if monetary_results:
309
+ search_results = monetary_results
310
+ adapter.info(f"Found {len(search_results)} results using monetary search")
311
+
312
+ if not search_results:
313
+ search_results = db.search_with_context(
314
+ query.question,
315
+ top_k=TOP_K_SEARCH,
316
+ include_related=True
317
+ )
318
+ adapter.info(f"Found {len(search_results)} results using semantic search with context")
319
+
320
+ if not search_results:
321
+ adapter.warning("No relevant context found in vector DB.")
322
+ return {
323
+ "request_id": getattr(request.state, 'request_id', 'N/A'),
324
+ "question": query.question,
325
+ "context_used": "No relevant context found.",
326
+ "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing or ask about specific delegation limits, approval authorities, or procedures."
327
+ }
328
+
329
+ # Log search results with metadata
330
+ result_info = []
331
+ for i, result in enumerate(search_results):
332
+ metadata = result.get('metadata', {})
333
+ role = metadata.get('role', 'N/A')
334
+ section = metadata.get('section', 'N/A')
335
+ score = result.get('relevance_score', 0)
336
+ result_info.append(f"#{i+1}: Score={score:.3f}, Role={role}, Section={section}")
337
+
338
+ adapter.info(f"Search results: {' | '.join(result_info)}")
339
 
340
+ # Prepare context with metadata
341
+ context_chunks = []
342
+ for result in search_results[:TOP_K_CONTEXT]:
343
+ chunk_text = result['text']
344
+ metadata = result.get('metadata', {})
345
+
346
+ if metadata.get('section') or metadata.get('role'):
347
+ metadata_prefix = f"[Section: {metadata.get('section', 'N/A')}, Role: {metadata.get('role', 'N/A')}] "
348
+ chunk_text = metadata_prefix + chunk_text
349
+
350
+ context_chunks.append(chunk_text)
351
 
352
+ context = "\n---\n".join(context_chunks)
353
+ prompt = build_enhanced_prompt(query.question, context, query_type, search_results)
354
+
355
+ # Generate response synchronously
356
+ answer = "An error occurred while processing your request."
357
+ try:
358
+ adapter.info(f"Sending prompt to LLM for {query_type} query (synchronous processing)...")
 
359
 
360
+ # ✅ Direct synchronous call - no threading or async execution
361
+ raw_answer = generate_llm_response_sync(prompt, request.state.request_id)
362
+
363
+ adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
364
+
365
+ # Post-processing
366
+ if '|' in raw_answer:
367
+ adapter.info("Pipe separator found. Formatting response as a bulleted list.")
368
+ items = raw_answer.split('|')
369
+ cleaned_items = [f"• {item.strip()}" for item in items if item.strip()]
370
+ answer = "\n".join(cleaned_items)
371
+ else:
372
+ answer = raw_answer.strip()
373
+
374
+ if query_type == "monetary" and "₹" not in answer and extract_monetary_amount(query.question):
375
+ amount = extract_monetary_amount(query.question)
376
+ answer = f"For amounts of ₹{amount:,.0f}:\n\n{answer}"
377
 
378
+ except Exception as e:
379
+ adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
380
+ answer = "Sorry, an unexpected error occurred while generating a response."
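For reference (illustrative, with made-up placeholder text, not part of this commit), the pipe handling above reshapes a flat model answer into a bulleted list:

raw_answer = "Officer A may approve | Officer B must countersign | Board approval is required beyond the limit"
# After the split/strip above, `answer` becomes:
# • Officer A may approve
# • Officer B must countersign
# • Board approval is required beyond the limit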
381
 
382
+ adapter.info(f"Final answer prepared for {query_type} query. Returning to client.")
383
+ return {
384
+ "request_id": request.state.request_id,
385
+ "question": query.question,
386
+ "context_used": context,
387
+ "answer": answer,
388
+ "query_type": query_type,
389
+ "search_strategy": "monetary" if query_type == "monetary" and extract_monetary_amount(query.question) else "semantic_with_context",
390
+ "processing_mode": "single_threaded"
391
+ }
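Usage sketch (illustrative, not part of this commit; the host and port are assumptions, 7860 being the port Hugging Face Spaces usually exposes): the endpoint accepts a JSON body matching the Query schema and returns the fields assembled above.

import requests

resp = requests.post(
    "http://localhost:7860/chat",
    json={"question": "What are the delegation limits for procurement?"},
    timeout=120,  # single-threaded CPU generation can be slow
)
print(resp.json()["answer"])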
392
 
393
  @app.post("/feedback")
394
  async def collect_feedback(feedback: Feedback, request: Request):
 
405
  adapter.info(json.dumps(feedback_log))
406
  return {"status": "✅ Feedback recorded. Thank you!"}
407
 
408
+ # ✅ No cleanup needed for single-threaded processing
409
  @app.on_event("shutdown")
410
  async def shutdown_event():
411
+ logger.info("Application shutting down (single-threaded mode).")