Kalpokoch committed on
Commit 08a91ba · verified · 1 Parent(s): 9d9eece

Update app/app.py

Files changed (1)
  1. app/app.py +69 -50
app/app.py CHANGED
@@ -12,13 +12,10 @@ from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
-# ✅ IMPROVEMENT: More detailed and structured logging format.
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s')
 
-# ✅ IMPROVEMENT: Custom adapter to inject a request ID into every log message for better traceability.
 class RequestIdAdapter(logging.LoggerAdapter):
     def process(self, msg, kwargs):
-        # The request_id is injected into the 'extra' dict.
         return '[%s] %s' % (self.extra['request_id'], msg), kwargs
 
 logger = logging.getLogger("app")
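
Not part of this commit — a usage sketch of the adapter above, with a made-up request ID:

adapter = RequestIdAdapter(logger, {"request_id": "3f6c1d2e"})
adapter.info("Vector DB search started")  # process() emits "[3f6c1d2e] Vector DB search started"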
@@ -26,11 +23,10 @@ logger = logging.getLogger("app")
 # -----------------------------
 # ✅ Configuration
 # -----------------------------
-# ✅ IMPROVEMENT: Centralized configuration using environment variables with sensible defaults.
 DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
-CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_improved.jsonl")
+CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "60"))
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
 RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.2"))
 TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "5"))
 TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "3"))
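
Every setting still reads the environment first, so a deployment can restore the old values without touching code — a sketch with illustrative values:

import os
os.environ["LLM_TIMEOUT_SECONDS"] = "60"  # e.g. set by the deployment to keep the old timeout
LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
print(LLM_TIMEOUT_SECONDS)  # 60 — the environment overrides the new in-code default of 45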
@@ -38,17 +34,13 @@ TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "3"))
 # -----------------------------
 # ✅ Initialize FastAPI App
 # -----------------------------
-app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="1.1.0")
+app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="1.2.0")
 
-# ✅ IMPROVEMENT: Middleware to add a unique request ID to each incoming request.
-# This helps in tracing a request's entire lifecycle through the logs.
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
     request_id = str(uuid.uuid4())
-    # Make the request_id available to the logger
     request.state.request_id = request_id
     response = await call_next(request)
-    # Add the request_id to the response headers
     response.headers["X-Request-ID"] = request_id
     return response
 
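A quick way to see the new header end-to-end (a sketch using FastAPI's TestClient; not part of this commit):

from fastapi.testclient import TestClient

client = TestClient(app)
resp = client.get("/")
print(resp.headers["X-Request-ID"])  # a fresh UUID per request, handy for grepping the logs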
@@ -63,7 +55,7 @@ try:
         relevance_threshold=RELEVANCE_THRESHOLD
     )
     if not ensure_db_populated(db, CHUNKS_FILE_PATH):
-        logger.warning("DB not populated on startup. RAG will not function correctly until data is loaded.")
+        logger.warning("DB not populated on startup. RAG will not function correctly.")
         db_ready = False
     else:
         logger.info("Vector DB is populated and ready.")
@@ -80,11 +72,11 @@ logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=2048, # Context window size
-        n_threads=4, # Number of CPU threads to use
-        n_batch=512, # Batch size for prompt processing
-        use_mlock=True, # Use mlock to keep model in memory
-        verbose=False # Suppress verbose output from llama.cpp
+        n_ctx=2048,
+        n_threads=4,
+        n_batch=512,
+        use_mlock=True,
+        verbose=False
     )
     logger.info("GGUF model loaded successfully.")
     model_ready = True
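
After a successful load, a minimal smoke test looks like this (hypothetical prompt, same call style the app uses below; not part of this commit):

out = llm("### Question:\nWhat does DoP stand for?\n\n### Detailed Answer:\n",
          max_tokens=32, temperature=0.1, echo=False)
print(out["choices"][0]["text"].strip())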
@@ -104,24 +96,21 @@ class Feedback(BaseModel):
     question: str
     answer: str
     context_used: str
-    feedback: str # e.g., "correct", "incorrect", "helpful", "not-helpful"
+    feedback: str
     comment: str | None = None
 
 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
 def get_logger_adapter(request: Request):
-    """Helper to get a logger adapter with the current request_id."""
     return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
 
 @app.get("/")
 async def root():
     return {"status": "✅ Server is running."}
 
-# ✅ IMPROVEMENT: Added a health check endpoint for monitoring.
 @app.get("/health")
 async def health_check():
-    """Provides a detailed health status of the application components."""
     status = {
         "status": "ok",
         "database_status": "ready" if db_ready else "error",
@@ -131,17 +120,12 @@ async def health_check():
         raise HTTPException(status_code=503, detail=status)
     return status
 
-# ✅ IMPROVEMENT: Run synchronous LLM calls in a separate thread to avoid blocking the event loop.
 async def generate_llm_response(prompt: str, request_id: str):
-    """Helper function to run synchronous LLM inference in a thread-safe manner."""
     loop = asyncio.get_running_loop()
-
-    # Use to_thread to run the blocking I/O call in a separate thread
     response = await loop.run_in_executor(
-        None, # Use the default thread pool executor
-        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:"], temperature=0.1, echo=False)
+        None,
+        lambda: llm(prompt, max_tokens=1024, stop=["###", "Question:", "Context:", "</s>"], temperature=0.1, echo=False)
     )
-
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
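
The blocking llama.cpp call runs on the default thread pool so the event loop stays responsive. Presumably LLM_TIMEOUT_SECONDS from the configuration above bounds this call inside /chat; that wiring is not shown in this diff, but would look roughly like:

try:
    answer = await asyncio.wait_for(
        generate_llm_response(prompt, request_id),
        timeout=LLM_TIMEOUT_SECONDS,
    )
except asyncio.TimeoutError:
    answer = "The request timed out while generating a response. Please try again."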
@@ -149,57 +133,93 @@ async def generate_llm_response(prompt: str, request_id: str):
 
 @app.post("/chat")
 async def chat(query: Query, request: Request):
-    # ✅ IMPROVEMENT: Get a logger adapter with the request ID for this specific request.
     adapter = get_logger_adapter(request)
+    question_lower = query.question.strip().lower()
+
+    # --- NEW: GREETING & INTRO HANDLING ---
+    greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
+    # Check if the question is a simple greeting
+    if question_lower in greeting_keywords:
+        adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
+        intro_message = (
+            "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
+            "My purpose is to help you find accurate information and answer questions based on this specific dataset. "
+            "I am currently running on a CPU-based environment. How can I assist you with the DoP policy today?"
+        )
+        return {
+            "request_id": getattr(request.state, 'request_id', 'N/A'),
+            "question": query.question,
+            "context_used": "N/A - Greeting",
+            "answer": intro_message
+        }
+    # --- END OF GREETING HANDLING ---
 
     if not db_ready or not model_ready:
         adapter.error("Service unavailable due to initialization failure.")
         raise HTTPException(status_code=503, detail="Service is not ready. Please check logs.")
 
-    question = query.question.strip()
-    adapter.info(f"Received query: '{question}'")
+    adapter.info(f"Received query: '{query.question}'")
 
     # 1. Search Vector DB
-    adapter.info(f"Searching vector DB with top_k={TOP_K_SEARCH} and threshold={RELEVANCE_THRESHOLD}")
-    search_results = db.search(question, top_k=TOP_K_SEARCH)
+    search_results = db.search(query.question, top_k=TOP_K_SEARCH)
 
     if not search_results:
         adapter.warning("No relevant context found in vector DB.")
         return {
-            "question": question,
+            "question": query.question,
             "context_used": "No relevant context found.",
             "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
         }
 
-    # ✅ IMPROVEMENT: Detailed logging of search results.
     scores = [f"{result['relevance_score']:.4f}" for result in search_results]
     adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
 
     # 2. Prepare Context
     context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
     context = "\n---\n".join(context_chunks)
-    adapter.info(f"Using top {len(context_chunks)} contexts for prompt.")
-    # For debugging, you can log the full context, but be mindful of log size.
-    # adapter.debug(f"Full context being used:\n{context}")
-
+
     # 3. Build Prompt
     prompt = f"""<|system|>
-You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP). Your rules are:
-1. **Answer ONLY using the information from the 'Relevant Context' provided.**
-2. **Use Markdown for formatting.** When the context contains a list of items, composition, or steps, present them as bullet points (`*` or `-`). Do not use plain text for lists.
-3. If the answer is in the context, formulate a concise answer based *exclusively* on the text. Cite the relevant clause if possible.
-4. **If the answer is NOT in the context, you MUST reply with: "The provided policy context does not contain information on this topic."** Do not use your own knowledge or guess.</s>
-<|user|>
+You are a precise and factual assistant. Follow the user's instructions exactly as shown in the example.
+---
+**EXAMPLE START**
+
+### Relevant Context:
+```json
+{{"section": "Urgent Local Purchases", "title": "Committee for Purchases Below ₹50,000", "clause": "LPC-2", "composition": [{{"Chairman": "Senior Manager"}}, {{"Members": ["One member from Finance", "One member from Indenter side (not below the rank of Deputy Manager)"]}}]}}
+```
+
+### Question:
+What is the composition of the LPC-2 committee?
+
+### Detailed Answer:
+According to the policy on Urgent Local Purchases (Clause LPC-2), the committee is composed of:
+* **Chairman:** Senior Manager
+* **Members:**
+    * One member from Finance
+    * One member from the Indenter side (not below the rank of Deputy Manager)
+
+**EXAMPLE END**
+---
+</s>
+<|user|>
 ### Relevant Context:
+```json
 {context}
+```
 
 ### Question:
-{question}</s>
+{query.question}
+
+### INSTRUCTIONS FOR YOUR ANSWER:
+1. Based **ONLY** on the "Relevant Context" above, provide a detailed answer to the "Question".
+2. If the context contains a list of items, rules, or procedures, you **MUST list ALL of them**. Do not skip or summarize.
+3. Format your list using Markdown bullet points (`*`).
+4. If the context does not contain the answer, reply **ONLY** with: "The provided policy context does not contain information on this topic."
+</s>
 <|assistant|>
 ### Detailed Answer:
 """
-    adapter.info("Generated prompt for LLM.")
-    # adapter.debug(f"Full prompt for LLM:\n{prompt}")
 
     # 4. Generate Response
     answer = "An error occurred while processing your request."
@@ -220,7 +240,7 @@ async def chat(query: Query, request: Request):
     adapter.info(f"Final answer prepared. Returning to client.")
     return {
         "request_id": request.state.request_id,
-        "question": question,
+        "question": query.question,
         "context_used": context,
         "answer": answer
     }
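
End-to-end, the handler now echoes the original question back in the payload; a sketch using FastAPI's TestClient (hypothetical question, assumes the DB and model initialised cleanly):

from fastapi.testclient import TestClient

client = TestClient(app)
resp = client.post("/chat", json={"question": "What is the composition of the LPC-2 committee?"})
body = resp.json()
print(body["request_id"], body["answer"])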
@@ -228,7 +248,6 @@ async def chat(query: Query, request: Request):
 @app.post("/feedback")
 async def collect_feedback(feedback: Feedback, request: Request):
     adapter = get_logger_adapter(request)
-    # ✅ IMPROVEMENT: Log feedback as a structured JSON object for easier parsing and analysis later.
     feedback_log = {
         "type": "USER_FEEDBACK",
         "request_id": feedback.request_id,
 