Update app/app.py
app/app.py  +22 -27  CHANGED
@@ -1,5 +1,3 @@
-# Complete and final app.py
-
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
@@ -53,19 +51,17 @@ else:
     logger.info("[INFO] Vector DB ready.")
 
 # -----------------------------
-# ✅ Load
+# ✅ Load TinyLlama GGUF Model
 # -----------------------------
-
-MODEL_PATH = "/app/dop-phi-1.5-Q4_K_M.gguf"
-
-
+MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
 logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
 
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=
-    n_threads=
-
+    n_ctx=512,
+    n_threads=1,
+    n_batch=8,
+    use_mlock=False,
     verbose=False
 )
 logger.info("[INFO] Model loaded successfully.")
@@ -79,12 +75,12 @@ class Query(BaseModel):
 # -----------------------------
 # ✅ Chat Endpoint
 # -----------------------------
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "
-logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds.")
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30"))
+logger.info(f"[INFO] LLM_TIMEOUT_SECONDS set to: {LLM_TIMEOUT_SECONDS} seconds at startup.")
 
 async def generate_llm_response(prompt: str):
     """Helper function to run synchronous LLM inference."""
-    response = llm(prompt, max_tokens=384, stop=["
+    response = llm(prompt, max_tokens=384, stop=["###"], temperature=0.2)
     answer = response["choices"][0]["text"].strip()
     if not answer:
         raise ValueError("Empty response from LLM")
@@ -102,34 +98,33 @@ async def chat(query: Query):
         reverse=True
     )
 
+    for i, r in enumerate(search_results):
+        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}...")
+
     if not filtered:
         logger.info("[RESPONSE] No relevant context found.")
         return {
             "question": question,
-            "context_used": "No relevant context found.",
-            "answer": "Sorry, I
+            "context_used": "No relevant context found above the relevance threshold.",
+            "answer": "Sorry, I need more detail in the question to provide an answer based on the policies."
         }
 
     context = filtered[0]["text"]
-    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f})")
+    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f}): {context[:100]}...")
 
-
-    prompt = f"""Instruct: Use the following context to answer the question.
-Context: {context}
-Question: {question}
-Output:"""
+    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
 
-    answer = "Sorry, I couldn't process your request right now. Please try again later."
+    answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
     try:
         answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
     except asyncio.TimeoutError:
-        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds
-        answer = "Sorry,
+        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. The operation took too long. Please try again later or rephrase your question."
     except Exception as e:
-        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
-        answer = "Sorry,
+        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. An unexpected error occurred. Please try again later or rephrase your question."
 
-    logger.info(f"[RESPONSE] Answered: {answer[:100]}...")
+    logger.info(f"[RESPONSE] Answered for '{question}': {answer[:100]}...")
     return {
         "question": question,
         "context_used": context,
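A note on the new loader settings: n_ctx=512, n_threads=1, n_batch=8 and use_mlock=False appear chosen to keep the TinyLlama GGUF model within the CPU and RAM limits of a small Space, at the cost of a short context window and slower generation. Below is a minimal sketch of the same call with the knobs read from the environment, mirroring how LLM_TIMEOUT_SECONDS is already handled; the variable names LLM_N_CTX, LLM_N_THREADS and LLM_N_BATCH are illustrative and not part of this commit.

import os
from llama_cpp import Llama

# Sketch only: same settings as the commit, but tunable via env vars.
# LLM_N_CTX / LLM_N_THREADS / LLM_N_BATCH are hypothetical names, not in app.py.
MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=int(os.getenv("LLM_N_CTX", "512")),        # prompt + completion token window
    n_threads=int(os.getenv("LLM_N_THREADS", "1")),  # CPU threads used for inference
    n_batch=int(os.getenv("LLM_N_BATCH", "8")),      # tokens evaluated per batch
    use_mlock=False,                                 # don't pin model pages in RAM
    verbose=False,
)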
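Related sizing note: generate_llm_response asks for max_tokens=384 while the model is loaded with n_ctx=512, so if the completion is to keep its full budget the prompt (instructions, retrieved chunk, and question) has roughly 128 tokens to work with, and an oversized chunk from the vector DB can overflow the window. A hedged sketch of one way to cap the chunk before building the prompt; the 400-character limit is an illustrative guess, not a value from this commit.

# Sketch: bound the retrieved chunk so the prompt fits the 512-token window.
# MAX_CONTEXT_CHARS is an illustrative budget, not tuned or taken from app.py.
MAX_CONTEXT_CHARS = 400

def build_prompt(context: str, question: str) -> str:
    context = context[:MAX_CONTEXT_CHARS]
    return (
        "You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies. "
        "Only use the context provided. Be precise.\n"
        f"### Relevant Context:\n{context}\n"
        f"### Question:\n{question}\n"
        "### Answer:"
    )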
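On the timeout path: asyncio.wait_for wraps generate_llm_response, but the llama_cpp call inside it is synchronous, so it blocks the event loop and the TimeoutError can only surface once inference has already finished. A minimal sketch, assuming llm is the module-level Llama instance, that moves the blocking call onto a worker thread so the timeout and other requests stay responsive; note that a timeout still does not stop the thread itself, it only lets the endpoint return early.

import asyncio

async def generate_llm_response(prompt: str) -> str:
    """Run the blocking llama_cpp call in a worker thread so the event loop stays free."""
    def _infer() -> dict:
        # Same call as in the commit; llm is the module-level Llama instance.
        return llm(prompt, max_tokens=384, stop=["###"], temperature=0.2)

    response = await asyncio.to_thread(_infer)
    answer = response["choices"][0]["text"].strip()
    if not answer:
        raise ValueError("Empty response from LLM")
    return answer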
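For quick verification after the Space rebuilds, something like the snippet below exercises the endpoint end to end. The route path (/chat), the Query field name (question), and the local port 7860 are assumptions based on the surrounding code and typical Space setups, not details visible in this diff; the sample question is illustrative only.

import requests

# Assumed route, field name, and port; adjust to match the actual app.
resp = requests.post(
    "http://localhost:7860/chat",
    json={"question": "Who can approve works contracts at the project level?"},
    timeout=60,
)
print(resp.json()["answer"])
print(resp.json()["context_used"][:200])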