Kalpokoch committed on
Commit fe6354c · verified · 1 Parent(s): b636490

Update app/app.py

Files changed (1):
  1. app/app.py +22 -27
app/app.py CHANGED
@@ -1,5 +1,3 @@
-# Complete and final app.py
-
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
@@ -53,19 +51,17 @@ else:
     logger.info("[INFO] Vector DB ready.")
 
 # -----------------------------
-# ✅ Load Your GGUF Model
+# ✅ Load TinyLlama GGUF Model
 # -----------------------------
-# <-- UPDATED: Points to the new local model file downloaded in the Dockerfile
-MODEL_PATH = "/app/dop-phi-1.5-Q4_K_M.gguf"
-
-
+MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
 logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
 
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=2048,
-    n_threads=2,
-    n_gpu_layers=0,
+    n_ctx=512,
+    n_threads=1,
+    n_batch=8,
+    use_mlock=False,
     verbose=False
 )
 logger.info("[INFO] Model loaded successfully.")
@@ -79,12 +75,12 @@ class Query(BaseModel):
 # -----------------------------
 # ✅ Chat Endpoint
 # -----------------------------
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "45"))
-logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30"))
+logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds at startup.")
 
 async def generate_llm_response(prompt: str):
     """Helper function to run synchronous LLM inference."""
-    response = llm(prompt, max_tokens=384, stop=["Instruct:", "Output:", "###"], temperature=0.2, echo=False)
+    response = llm(prompt, max_tokens=384, stop=["###"], temperature=0.2)
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
@@ -102,34 +98,33 @@ async def chat(query: Query):
         reverse=True
     )
 
+    for i, r in enumerate(search_results):
+        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}...")
+
     if not filtered:
         logger.info("[RESPONSE] No relevant context found.")
         return {
             "question": question,
-            "context_used": "No relevant context found.",
-            "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
+            "context_used": "No relevant context found above the relevance threshold.",
+            "answer": "Sorry, I need more detail in the question to provide an answer based on the policies."
         }
 
     context = filtered[0]["text"]
-    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
+    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f}): {context[:100]}...")
 
-    # This prompt format matches how you fine-tuned Phi-1.5
-    prompt = f"""Instruct: Use the following context to answer the question.
-Context: {context}
-Question: {question}
-Output:"""
+    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
 
-    answer = "Sorry, I couldn't process your request right now. Please try again later."
+    answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
     try:
         answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
     except asyncio.TimeoutError:
-        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
-        answer = "Sorry, the request took too long to process. Please try again with a simpler question."
+        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. The operation took too long. Please try again later or rephrase your question."
     except Exception as e:
-        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
-        answer = "Sorry, an unexpected error occurred while generating a response."
+        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. An unexpected error occurred. Please try again later or rephrase your question."
 
-    logger.info(f"[RESPONSE] Answered: {answer[:100]}...")
+    logger.info(f"[RESPONSE] Answered for '{question}': {answer[:100]}...")
     return {
         "question": question,
         "context_used": context,