Kalpokoch committed
Commit 72e44d7 · verified · 1 Parent(s): f65ed48

Update app/app.py

Files changed (1)
  1. app/app.py +35 -31
app/app.py CHANGED
@@ -4,12 +4,12 @@ from llama_cpp import Llama
 import logging
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 import asyncio
-import os # Import os to check for environment variables
+import os
 
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("app")
 
 # -----------------------------
@@ -46,7 +46,7 @@ db = PolicyVectorDB(
     relevance_threshold=0  # Low for more inclusive matching
 )
 if not ensure_db_populated(db, CHUNKS_FILE_PATH):
-    logger.warning("[WARNING] DB not populated. Chunks file may be missing.")
+    logger.warning("[WARNING] DB not populated. Chunks file may be missing or empty. RAG will not function correctly.")
 else:
     logger.info("[INFO] Vector DB ready.")
 
@@ -55,11 +55,16 @@ else:
 # -----------------------------
 MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
 logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
+
+# --- IMPORTANT: Experiment with these Llama parameters based on your CPU performance ---
+# n_ctx: Context window size. Adjust if your typical prompts are significantly shorter.
+# n_threads: Number of CPU threads. Experiment with 1, 2, or 4 for best results on your specific CPU.
+# n_batch: Batch size for internal processing. Experiment with 4, 8, or 16 for latency vs. throughput.
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=512,  # Lower for faster inference
-    n_threads=2,
-    n_batch=16,
+    n_ctx=512,    # Keep at 512, or reduce if your context is always short, e.g., 384
+    n_threads=2,  # <--- EXPERIMENT HERE (try 1, 2, or 4)
+    n_batch=16,   # <--- EXPERIMENT HERE (try 4, 8, or 16)
     use_mlock=False,
     verbose=False
 )
@@ -76,24 +81,20 @@ class Query(BaseModel):
 # -----------------------------
 
 # Define a reasonable timeout for LLM inference (e.g., 30 seconds)
-# You can make this configurable via environment variable if needed
+# This can be configured via an environment variable in Hugging Face Spaces.
 LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30"))  # Default to 30 seconds
 
 async def generate_llm_response(prompt: str):
-    """Helper function to run LLM inference, allowing it to be awaited with a timeout."""
-    try:
-        # llama_cpp.Llama's __call__ method is synchronous.
-        # To make it awaitable for asyncio.wait_for, we can run it in a thread pool.
-        # FastAPI's async def handles this automatically by running synchronous
-        # code in a default thread pool.
-        response = llm(prompt, max_tokens=128, stop=["###"], temperature=0.2)
-        answer = response["choices"][0]["text"].strip()
-        if not answer:
-            raise ValueError("Empty response from LLM")
-        return answer
-    except Exception as e:
-        logger.error(f"[ERROR] LLM generation failed: {str(e)}")
-        raise  # Re-raise to be caught by the timeout mechanism or outer try/except
+    """
+    Helper function to run synchronous LLM inference.
+    FastAPI's async def automatically runs blocking code in a thread pool,
+    making it compatible with asyncio.wait_for.
+    """
+    response = llm(prompt, max_tokens=128, stop=["###"], temperature=0.2)
+    answer = response["choices"][0]["text"].strip()
+    if not answer:
+        raise ValueError("Empty response from LLM")
+    return answer
 
 @app.post("/chat")
 async def chat(query: Query):
@@ -110,18 +111,20 @@ async def chat(query: Query):
         reverse=True
     )
 
-    # 🪵 Log context scores
+    # 🪵 Log context scores for debugging
     for i, r in enumerate(search_results):
-        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}")
+        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}...")
 
     if not filtered:
+        logger.info("[RESPONSE] No relevant context found.")
        return {
            "question": question,
            "context_used": "No relevant context found above the relevance threshold.",
-            "answer": "Sorry, I need more detail in the question to provide an answer."
+            "answer": "Sorry, I need more detail in the question to provide an answer based on the policies."
        }
 
    context = filtered[0]["text"]
+    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f}): {context[:100]}...")
 
    # ✨ Prompt Template
    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
@@ -129,18 +132,19 @@ async def chat(query: Query):
    # 🔮 Run LLM with safety and timeout
    answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
    try:
-        # We use asyncio.wait_for to enforce a timeout on the LLM generation.
-        # If generate_llm_response doesn't complete within LLM_TIMEOUT_SECONDS,
-        # asyncio.TimeoutError will be raised.
+        # Enforce timeout for LLM generation
        answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
    except asyncio.TimeoutError:
-        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: {question}")
+        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: '{question}'")
        answer = "Sorry, I couldn't process your request right now. The operation took too long. Please try again later or rephrase your question."
+    except ValueError as e:  # Catch explicit ValueError from generate_llm_response (e.g., empty response)
+        logger.error(f"[ERROR] LLM generation returned an invalid response: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. The model returned an invalid answer. Please try again later or rephrase your question."
    except Exception as e:
-        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
-        answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
+        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. An unexpected error occurred. Please try again later or rephrase your question."
 
-    logger.info(f"[RESPONSE] {answer}")
+    logger.info(f"[RESPONSE] Answered for '{question}': {answer[:100]}...")  # Log beginning of answer
    return {
        "question": question,
        "context_used": context,