Commit 4af310b
Parent(s): e65991a

RAG v2: HNSW ANN, cross-encoder reranking, evaluation logging, analytics dashboard

Files changed:
- analytics.py +81 -0
- analyze_logs.py +47 -0
- eval_logger.py +21 -0
- frontend/analytics.html +334 -0
- frontend/index.html +171 -96
- main.py +95 -6
- rag_eval_logs.jsonl +23 -0
- rag_store.py +122 -116
- render.yaml +0 -12
analytics.py
ADDED
@@ -0,0 +1,81 @@
import json
from collections import defaultdict
from datetime import datetime

LOG_FILE = "rag_eval_logs.jsonl"

def get_analytics():
    """Parse logs and return analytics data."""
    total = 0
    known_count = 0
    unknown_count = 0
    conf_sum = 0.0
    queries = []
    unknown_queries = []

    try:
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                total += 1
                data = json.loads(line)

                if data.get("answer_known"):
                    known_count += 1
                else:
                    unknown_count += 1
                    unknown_queries.append({
                        "query": data.get("query"),
                        "timestamp": datetime.fromtimestamp(data.get("timestamp", 0)).strftime("%Y-%m-%d %H:%M")
                    })

                conf_sum += data.get("confidence", 0.0)
                queries.append({
                    "query": data.get("query"),
                    "confidence": data.get("confidence", 0.0),
                    "answer_known": data.get("answer_known", False)
                })

        if total == 0:
            return {
                "total_queries": 0,
                "knowledge_rate": 0,
                "avg_confidence": 0,
                "known_count": 0,
                "unknown_count": 0,
                "recent_unknown": [],
                "top_queries": []
            }

        knowledge_rate = (known_count / total) * 100
        avg_confidence = conf_sum / total

        # Get top 10 most recent queries
        top_queries = queries[-10:][::-1]  # Last 10, reversed

        # Get recent unknown queries (last 5)
        recent_unknown = unknown_queries[-5:][::-1]

        return {
            "total_queries": total,
            "knowledge_rate": round(knowledge_rate, 1),
            "avg_confidence": round(avg_confidence, 2),
            "known_count": known_count,
            "unknown_count": unknown_count,
            "recent_unknown": recent_unknown,
            "top_queries": top_queries
        }

    except FileNotFoundError:
        return {
            "total_queries": 0,
            "knowledge_rate": 0,
            "avg_confidence": 0,
            "known_count": 0,
            "unknown_count": 0,
            "recent_unknown": [],
            "top_queries": []
        }
analyze_logs.py
ADDED
@@ -0,0 +1,47 @@
import json
from collections import Counter

LOG_FILE = "rag_eval_logs.jsonl"

def analyze():
    print(f"--- Analyzing {LOG_FILE} ---\n")

    total = 0
    known_count = 0
    unknown_count = 0
    conf_sum = 0.0

    try:
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line: continue

                total += 1
                data = json.loads(line)

                if data.get("answer_known"):
                    known_count += 1
                else:
                    unknown_count += 1

                conf_sum += data.get("confidence", 0.0)

        if total == 0:
            print("No logs found.")
            return

        print(f"Total Queries: {total}")
        print(f"Answered (Known): {known_count}")
        print(f"Unanswered (False): {unknown_count}")
        print(f"Average Confidence: {conf_sum / total:.2f}")
        print("-" * 30)

        accuracy = (known_count / total) * 100
        print(f"System 'Knowledge Rate': {accuracy:.1f}%")

    except FileNotFoundError:
        print(f"Log file {LOG_FILE} not found.")

if __name__ == "__main__":
    analyze()
eval_logger.py
ADDED
@@ -0,0 +1,21 @@
import json
from time import time

LOG_FILE = "rag_eval_logs.jsonl"

def log_eval(
    query: str,
    retrieved_count: int,
    confidence: float,
    answer_known: bool
):
    record = {
        "timestamp": time(),
        "query": query,
        "retrieved_count": retrieved_count,
        "confidence": confidence,
        "answer_known": answer_known
    }

    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")
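
For reference, a minimal usage sketch of the logger (the query text and scores below are made-up example values, not taken from the repo): each call appends one JSON object per line to rag_eval_logs.jsonl.

    from eval_logger import log_eval

    # Hypothetical example call; in main.py this runs after every /ask request.
    log_eval(
        query="what is the tuition fee?",  # example query, not from the logs
        retrieved_count=5,
        confidence=0.8,
        answer_known=True,
    )
    # Appends a line of the form:
    # {"timestamp": 1767700000.0, "query": "what is the tuition fee?", "retrieved_count": 5, "confidence": 0.8, "answer_known": true}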
frontend/analytics.html
ADDED
@@ -0,0 +1,334 @@
<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8" />
  <title>Analytics - Gemini RAG</title>
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">

  <style>
    :root {
      --bg: radial-gradient(1200px 600px at top, #e0e7ff 0%, #f8fafc 60%);
      --card: rgba(255, 255, 255, 0.9);
      --border: rgba(15, 23, 42, 0.08);
      --primary: #4f46e5;
      --secondary: #0ea5e9;
      --text: #0f172a;
      --muted: #64748b;
      --success: #16a34a;
      --error: #dc2626;
    }

    [data-theme="dark"] {
      --bg: radial-gradient(1200px 600px at top, #1e1b4b 0%, #0f172a 60%);
      --card: rgba(30, 41, 59, 0.9);
      --border: rgba(148, 163, 184, 0.1);
      --primary: #818cf8;
      --secondary: #38bdf8;
      --text: #f1f5f9;
      --muted: #94a3b8;
      --success: #4ade80;
      --error: #f87171;
    }

    * {
      box-sizing: border-box;
      font-family: Inter, sans-serif;
    }

    body {
      margin: 0;
      min-height: 100vh;
      background: var(--bg);
      padding: 40px 16px;
      color: var(--text);
      transition: background 0.3s ease, color 0.3s ease;
    }

    .container {
      max-width: 1200px;
      margin: 0 auto;
    }

    .header {
      display: flex;
      justify-content: space-between;
      align-items: center;
      margin-bottom: 32px;
    }

    h1 {
      font-size: 2.2rem;
      margin: 0;
      font-weight: 700;
      background: linear-gradient(135deg, #4f46e5, #06b6d4);
      background-clip: text;
      -webkit-background-clip: text;
      -webkit-text-fill-color: transparent;
    }

    .back-btn {
      padding: 10px 20px;
      background: var(--primary);
      color: white;
      text-decoration: none;
      border-radius: 12px;
      font-weight: 600;
      transition: transform 0.2s ease;
    }

    .back-btn:hover {
      transform: translateY(-2px);
    }

    .stats-grid {
      display: grid;
      grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
      gap: 20px;
      margin-bottom: 32px;
    }

    .stat-card {
      background: var(--card);
      backdrop-filter: blur(16px);
      border-radius: 18px;
      padding: 24px;
      border: 1px solid var(--border);
      box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
    }

    .stat-label {
      font-size: 0.85rem;
      color: var(--muted);
      margin-bottom: 8px;
      text-transform: uppercase;
      letter-spacing: 0.5px;
    }

    .stat-value {
      font-size: 2.5rem;
      font-weight: 700;
      color: var(--primary);
    }

    .card {
      background: var(--card);
      backdrop-filter: blur(16px);
      border-radius: 18px;
      padding: 28px;
      border: 1px solid var(--border);
      box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
      margin-bottom: 24px;
    }

    .card h2 {
      margin-top: 0;
      margin-bottom: 20px;
      font-size: 1.3rem;
    }

    table {
      width: 100%;
      border-collapse: collapse;
    }

    th,
    td {
      text-align: left;
      padding: 12px;
      border-bottom: 1px solid var(--border);
    }

    th {
      font-weight: 600;
      color: var(--muted);
      font-size: 0.85rem;
      text-transform: uppercase;
      letter-spacing: 0.5px;
    }

    .badge {
      display: inline-block;
      padding: 4px 10px;
      border-radius: 12px;
      font-size: 0.75rem;
      font-weight: 600;
    }

    .badge-success {
      background: #dcfce7;
      color: #166534;
    }

    .badge-error {
      background: #fee2e2;
      color: #991b1b;
    }

    .theme-toggle {
      position: fixed;
      top: 20px;
      right: 20px;
      background: var(--card);
      border: 1px solid var(--border);
      border-radius: 12px;
      padding: 10px;
      cursor: pointer;
      font-size: 1.4rem;
      transition: transform 0.2s ease;
      box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
    }

    .theme-toggle:hover {
      transform: scale(1.1);
    }

    .empty-state {
      text-align: center;
      padding: 60px 20px;
      color: var(--muted);
    }
  </style>
</head>

<body>
  <button class="theme-toggle" onclick="toggleTheme()" title="Toggle dark mode">🌙</button>

  <div class="container">
    <div class="header">
      <h1>📊 Analytics Dashboard</h1>
      <a href="/" class="back-btn">← Back to RAG</a>
    </div>

    <div id="stats-container">
      <div class="empty-state">
        <h2>Loading analytics...</h2>
      </div>
    </div>
  </div>

  <script>
    // ===== THEME TOGGLE =====
    function toggleTheme() {
      const html = document.documentElement;
      const currentTheme = html.getAttribute('data-theme');
      const newTheme = currentTheme === 'dark' ? 'light' : 'dark';

      html.setAttribute('data-theme', newTheme);
      localStorage.setItem('theme', newTheme);

      const btn = document.querySelector('.theme-toggle');
      btn.textContent = newTheme === 'dark' ? '☀️' : '🌙';
    }

    // Load saved theme
    (function () {
      const savedTheme = localStorage.getItem('theme') || 'light';
      document.documentElement.setAttribute('data-theme', savedTheme);
      const btn = document.querySelector('.theme-toggle');
      if (btn) btn.textContent = savedTheme === 'dark' ? '☀️' : '🌙';
    })();

    // ===== LOAD ANALYTICS =====
    async function loadAnalytics() {
      try {
        const res = await fetch('/analytics');
        const data = await res.json();

        if (data.total_queries === 0) {
          document.getElementById('stats-container').innerHTML = `
            <div class="empty-state">
              <h2>No data yet</h2>
              <p>Start asking questions to see analytics!</p>
            </div>
          `;
          return;
        }

        const html = `
          <div class="stats-grid">
            <div class="stat-card">
              <div class="stat-label">Total Queries</div>
              <div class="stat-value">${data.total_queries}</div>
            </div>
            <div class="stat-card">
              <div class="stat-label">Knowledge Rate</div>
              <div class="stat-value">${data.knowledge_rate}%</div>
            </div>
            <div class="stat-card">
              <div class="stat-label">Avg Confidence</div>
              <div class="stat-value">${(data.avg_confidence * 100).toFixed(0)}%</div>
            </div>
            <div class="stat-card">
              <div class="stat-label">Unknown Queries</div>
              <div class="stat-value" style="color: var(--error)">${data.unknown_count}</div>
            </div>
          </div>

          <div class="card">
            <h2>Recent Queries</h2>
            <table>
              <thead>
                <tr>
                  <th>Query</th>
                  <th>Confidence</th>
                  <th>Status</th>
                </tr>
              </thead>
              <tbody>
                ${data.top_queries.map(q => `
                  <tr>
                    <td>${q.query}</td>
                    <td>${(q.confidence * 100).toFixed(0)}%</td>
                    <td>
                      <span class="badge ${q.answer_known ? 'badge-success' : 'badge-error'}">
                        ${q.answer_known ? 'Known' : 'Unknown'}
                      </span>
                    </td>
                  </tr>
                `).join('')}
              </tbody>
            </table>
          </div>

          ${data.recent_unknown.length > 0 ? `
            <div class="card">
              <h2>Recent "I Don't Know" Queries</h2>
              <table>
                <thead>
                  <tr>
                    <th>Query</th>
                    <th>Time</th>
                  </tr>
                </thead>
                <tbody>
                  ${data.recent_unknown.map(q => `
                    <tr>
                      <td>${q.query}</td>
                      <td>${q.timestamp}</td>
                    </tr>
                  `).join('')}
                </tbody>
              </table>
            </div>
          ` : ''}
        `;

        document.getElementById('stats-container').innerHTML = html;
      } catch (e) {
        document.getElementById('stats-container').innerHTML = `
          <div class="empty-state">
            <h2>Error loading analytics</h2>
            <p>${e.message}</p>
          </div>
        `;
      }
    }

    // Load on page load
    loadAnalytics();
  </script>
</body>

</html>
frontend/index.html
CHANGED
@@ -1,5 +1,6 @@
 <!DOCTYPE html>
 <html lang="en">
+
 <head>
   <meta charset="UTF-8" />
   <title>Gemini RAG Assistant</title>
@@ -10,8 +11,8 @@
   <style>
     :root {
       --bg: radial-gradient(1200px 600px at top, #e0e7ff 0%, #f8fafc 60%);
-      --card: rgba(255,255,255,0.9);
-      --border: rgba(15,23,42,0.08);
+      --card: rgba(255, 255, 255, 0.9);
+      --border: rgba(15, 23, 42, 0.08);
       --primary: #4f46e5;
       --secondary: #0ea5e9;
       --text: #0f172a;
@@ -20,7 +21,22 @@
       --success: #16a34a;
     }

+    [data-theme="dark"] {
+      --bg: radial-gradient(1200px 600px at top, #1e1b4b 0%, #0f172a 60%);
+      --card: rgba(30, 41, 59, 0.9);
+      --border: rgba(148, 163, 184, 0.1);
+      --primary: #818cf8;
+      --secondary: #38bdf8;
+      --text: #f1f5f9;
+      --muted: #94a3b8;
+      --error: #f87171;
+      --success: #4ade80;
+    }
+
+    * {
+      box-sizing: border-box;
+      font-family: Inter, sans-serif;
+    }

     body {
       margin: 0;
@@ -30,6 +46,7 @@
       justify-content: center;
       padding: 40px 16px;
       color: var(--text);
+      transition: background 0.3s ease, color 0.3s ease;
     }

     .container {
@@ -40,7 +57,7 @@
       border-radius: 24px;
       padding: 36px;
       border: 1px solid var(--border);
+      box-shadow: 0 40px 120px rgba(15, 23, 42, .15);
     }

     h1 {
@@ -48,6 +65,7 @@
       margin: 0;
       font-weight: 700;
       background: linear-gradient(135deg, #4f46e5, #06b6d4);
+      background-clip: text;
       -webkit-background-clip: text;
       -webkit-text-fill-color: transparent;
     }
@@ -60,7 +78,7 @@

     .card {
       margin-top: 28px;
+      background: var(--card);
       border-radius: 18px;
       padding: 24px;
       border: 1px solid var(--border);
@@ -72,13 +90,15 @@
       font-size: 1.1rem;
     }

+    input[type="file"],
+    textarea {
       width: 100%;
       padding: 14px;
       border-radius: 14px;
       border: 1px solid var(--border);
       font-size: 0.95rem;
+      background: var(--card);
+      color: var(--text);
     }

     textarea {
@@ -104,7 +124,9 @@
       transition: all .2s ease;
     }

+    button.secondary {
+      background: var(--secondary);
+    }

     button:disabled {
       opacity: .5;
@@ -113,7 +135,7 @@

     button:hover:not(:disabled) {
       transform: translateY(-1px);
+      box-shadow: 0 4px 12px rgba(79, 70, 229, .2);
     }

     .status {
@@ -126,10 +148,11 @@
       margin-top: 24px;
       padding: 22px;
       border-radius: 16px;
+      background: var(--card);
       border: 1px solid var(--border);
       line-height: 1.6;
       box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05);
+      color: var(--text);
     }

     .confidence-badge {
@@ -163,120 +186,172 @@
     }

     @keyframes pulse {
+      0% {
+        opacity: .4
+      }
+
+      50% {
+        opacity: 1
+      }
+
+      100% {
+        opacity: .4
+      }
+    }
+
+    .theme-toggle {
+      position: fixed;
+      top: 20px;
+      right: 20px;
+      background: var(--card);
+      border: 1px solid var(--border);
+      border-radius: 12px;
+      padding: 10px;
+      cursor: pointer;
+      font-size: 1.4rem;
+      transition: transform 0.2s ease;
+      box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
+    }
+
+    .theme-toggle:hover {
+      transform: scale(1.1);
     }
   </style>
 </head>

 <body>
+  <button class="theme-toggle" onclick="toggleTheme()" title="Toggle dark mode">🌙</button>
+  <div class="container">
+    <h1>Gemini RAG Assistant</h1>
+    <div class="subtitle">Upload documents · Ask questions · Get grounded answers · <a href="/frontend/analytics.html"
+        style="color: var(--primary); text-decoration: none; font-weight: 600;">📊 Analytics</a></div>
+
+    <div class="card">
+      <h3>1. Upload Knowledge</h3>
+      <input type="file" id="files" multiple accept=".pdf,.txt" />
+      <div class="row">
+        <button id="uploadBtn" onclick="upload()">Upload & Index Files</button>
+      </div>
+      <div id="uploadStatus" class="status"></div>
     </div>

+    <div class="card">
+      <h3>2. Ask or Summarize</h3>
+      <textarea id="question" placeholder="E.g., 'What are the main risks?' or 'Summarize the document'"></textarea>
+      <div class="row">
+        <button id="askBtn" onclick="ask()">Ask Question</button>
+        <button class="secondary" id="sumBtn" onclick="summarize()">Generate Summary</button>
+      </div>
     </div>

+    <div id="answerBox" class="answer" style="display:none;"></div>

+  </div>
+
+  <script>
+    // ===== THEME TOGGLE =====
+    function toggleTheme() {
+      const html = document.documentElement;
+      const currentTheme = html.getAttribute('data-theme');
+      const newTheme = currentTheme === 'dark' ? 'light' : 'dark';

+      html.setAttribute('data-theme', newTheme);
+      localStorage.setItem('theme', newTheme);

+      // Update button icon
+      const btn = document.querySelector('.theme-toggle');
+      btn.textContent = newTheme === 'dark' ? '☀️' : '🌙';
+    }

+    // Load saved theme on page load
+    (function () {
+      const savedTheme = localStorage.getItem('theme') || 'light';
+      document.documentElement.setAttribute('data-theme', savedTheme);
+      const btn = document.querySelector('.theme-toggle');
+      if (btn) btn.textContent = savedTheme === 'dark' ? '☀️' : '🌙';
+    })();
+
+    // ===== APP LOGIC =====
+    let busy = false;
+
+    function setBusy(state) {
+      busy = state;
+      document.getElementById("askBtn").disabled = state;
+      document.getElementById("sumBtn").disabled = state;
+      document.getElementById("uploadBtn").disabled = state;
     }

+    async function upload() {
+      const files = document.getElementById("files").files;
+      if (!files.length) {
+        alert("Please select files first.");
+        return;
+      }
+
+      setBusy(true);
+      const statusDiv = document.getElementById("uploadStatus");
+      statusDiv.innerText = "Indexing documents... this may take a moment.";

+      const fd = new FormData();
+      for (let f of files) fd.append("files", f);

+      try {
+        const res = await fetch("/upload", { method: "POST", body: fd });
+        if (!res.ok) throw new Error("Upload failed");
+        const data = await res.json();
+        statusDiv.innerText = data.message || "Done ✅";
+      } catch (e) {
+        statusDiv.innerText = "Error uploading files.";
+      }
+      setBusy(false);
     }

+    async function ask() {
+      const q = document.getElementById("question").value.trim();
+      if (!q) return;

+      setBusy(true);
+      const box = document.getElementById("answerBox");
+      box.style.display = "block";
+      box.innerHTML = "<span class='loader'>Thinking...</span>";

+      try {
+        const res = await fetch("/ask", {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({ prompt: q })
+        });

+        const data = await res.json();

+        let html = `<div><strong>Answer:</strong><br>${data.answer.replace(/\n/g, '<br>')}</div>`;

+        if (data.confidence > 0) {
+          html += `<div class="confidence-badge">Confidence: ${(data.confidence * 100).toFixed(0)}%</div>`;
+        }

+        if (data.citations && data.citations.length > 0) {
+          html += `<div class="citations"><strong>Sources:</strong><ul>`;
+          data.citations.forEach(c => {
+            html += `<li>${c.source} (Page ${c.page})</li>`;
+          });
+          html += `</ul></div>`;
+        }

+        box.innerHTML = html;

+      } catch (e) {
+        box.innerText = "⚠️ Error communicating with the server.";
+      }

+      setBusy(false);
+    }

+    function summarize() {
+      document.getElementById("question").value = "Summarize the uploaded documents";
+      ask();
+    }
+  </script>
 </body>
+
 </html>
main.py
CHANGED
@@ -9,6 +9,8 @@ from dotenv import load_dotenv
 import google.generativeai as genai

 from rag_store import ingest_documents, search_knowledge, get_all_chunks, clear_database
+from eval_logger import log_eval
+from analytics import get_analytics

 # =========================================================
 # ENV + MODEL SETUP
@@ -19,6 +21,11 @@ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 MODEL_NAME = "gemini-2.5-flash"
 USE_MOCK = False  # Set to False to use real API

+# =========================================================
+# FILE UPLOAD LIMITS
+# =========================================================
+MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB
+
 # =========================================================
 # APP
 # =========================================================
@@ -53,12 +60,17 @@ def serve_ui():
     with open("frontend/index.html", "r", encoding="utf-8") as f:
         return f.read()

+@app.get("/analytics")
+def analytics():
+    """Return analytics data from evaluation logs."""
+    return get_analytics()
+
 # ---------------------------------------------------------
 # UPLOAD
 # ---------------------------------------------------------
 @app.post("/upload")
 async def upload(files: list[UploadFile] = File(...)):
-    # 1. VALIDATION:
+    # 1. VALIDATION: File Type and Size Check
     for file in files:
         ext = file.filename.split(".")[-1].lower()
         if ext not in ["pdf", "txt"]:
@@ -66,6 +78,19 @@ async def upload(files: list[UploadFile] = File(...)):
                 status_code=400,
                 content={"error": f"Invalid file type: '{file.filename}'. Only .pdf and .txt files are allowed."}
             )
+
+        # Check file size
+        file.file.seek(0, 2)  # Seek to end
+        file_size = file.file.tell()
+        file.file.seek(0)  # Reset to beginning
+
+        if file_size > MAX_FILE_SIZE:
+            size_mb = file_size / (1024 * 1024)
+            max_mb = MAX_FILE_SIZE / (1024 * 1024)
+            return JSONResponse(
+                status_code=413,
+                content={"error": f"File '{file.filename}' is too large ({size_mb:.1f} MB). Maximum size is {max_mb:.0f} MB."}
+            )

     try:
         # 2. CLEAR CONTEXT: Start fresh for every upload session
@@ -126,13 +151,31 @@ async def ask(data: PromptRequest):
             return response
         except Exception as e:
             err_str = str(e)
+
+            # API Key Issues
+            if "API_KEY" in err_str or "invalid" in err_str.lower() and "key" in err_str.lower():
+                raise ValueError("Invalid API key. Please check your GEMINI_API_KEY in the .env file.")
+
+            # Quota Exhausted
+            if "quota" in err_str.lower() or "limit" in err_str.lower():
+                raise ValueError("API quota exhausted. Please try again later or upgrade your API plan.")
+
+            # Rate Limiting (429)
             if "429" in err_str:
                 if attempt < retries:
                     wait_time = base_delay * (2 ** attempt)
                     print(f"DEBUG: 429 Rate limit hit. Retrying in {wait_time}s...")
                     pytime.sleep(wait_time)
                     continue
+                else:
+                    raise ValueError("Rate limit exceeded. Please try again in a few minutes.")
+
+            # Safety Filters
+            if "safety" in err_str.lower() or "blocked" in err_str.lower():
+                raise ValueError("Content was blocked by safety filters. Please rephrase your question.")
+
+            # Generic error
+            raise ValueError(f"LLM API error: {err_str}")

     if is_summary:
         chunks = get_all_chunks(limit=80)
@@ -180,10 +223,18 @@ Content:
             answer_cache[key] = (now, response)
             return response

+        except ValueError as e:
+            # User-friendly error from generate_safe
             print(f"Summary failed: {e}")
             return JSONResponse(status_code=200, content={
+                "answer": str(e),
+                "confidence": 0.0,
+                "citations": []
+            })
+        except Exception as e:
+            print(f"Summary failed: {e}")
+            return JSONResponse(status_code=500, content={
+                "answer": f"An unexpected error occurred: {str(e)}",
                 "confidence": 0.0,
                 "citations": []
             })
@@ -199,6 +250,14 @@ Content:
             "confidence": 0.0,
             "citations": []
         }
+
+        log_eval(
+            query=prompt_text,
+            retrieved_count=0,
+            confidence=0.0,
+            answer_known=False
+        )
+
         answer_cache[key] = (now, response)
         return response

@@ -219,8 +278,29 @@ Context:
 Question:
 {prompt_text}
 """
+    llm = None
+    answer_text = ""
+
+    try:
+        llm = model.generate_content(prompt)
+        answer_text = llm.text
+    except ValueError as e:
+        # User-friendly error from API
+        response = {
+            "answer": str(e),
+            "confidence": 0.0,
+            "citations": []
+        }
+        answer_cache[key] = (now, response)
+        return response
+    except Exception as e:
+        # Unexpected error
+        response = {
+            "answer": f"An unexpected error occurred: {str(e)}",
+            "confidence": 0.0,
+            "citations": []
+        }
+        return JSONResponse(status_code=500, content=response)

     # Fix Fake Confidence: If the model says "I don't know", confidence should be 0.
     confidence = round(min(1.0, len(results) / 5), 2)
@@ -236,5 +316,14 @@ Question:
         }.values())
     }

+    answer_known = "i don't know" not in answer_text.lower()
+
+    log_eval(
+        query=prompt_text,
+        retrieved_count=len(results),
+        confidence=confidence,
+        answer_known=answer_known
+    )
+
     answer_cache[key] = (now, response)
     return response
rag_eval_logs.jsonl
ADDED
@@ -0,0 +1,23 @@
{"timestamp": 1767711001.7256067, "query": "how much did student got the scholarship?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711040.2298725, "query": "what is program duration and fees?", "retrieved_count": 5, "confidence": 0.0, "answer_known": false}
{"timestamp": 1767711058.4562092, "query": "what is program duration?", "retrieved_count": 5, "confidence": 0.0, "answer_known": false}
{"timestamp": 1767711112.7273183, "query": "what is university name?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711173.2461476, "query": "what is university name and tution fees?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711339.3448784, "query": "what is the scholarship name?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711360.8415213, "query": "how much for OSHC?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711421.3609428, "query": "what are the conditions to accept the offer?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711528.7769852, "query": "is this Sydney university?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711553.6641674, "query": "what is the father name?", "retrieved_count": 1, "confidence": 0.0, "answer_known": false}
{"timestamp": 1767711646.590812, "query": "tell me about refund process?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711707.6674147, "query": "when is orientation and enrollment?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767711808.9429202, "query": "is it good university?", "retrieved_count": 5, "confidence": 0.0, "answer_known": false}
{"timestamp": 1767714229.559012, "query": "what is candidate name and college name?", "retrieved_count": 1, "confidence": 0.0, "answer_known": false}
{"timestamp": 1767774219.903037, "query": "what is university name?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767774233.0324135, "query": "what are my risks?", "retrieved_count": 0, "confidence": 0.0, "answer_known": false}
{"timestamp": 1767774272.1751444, "query": "what is course name?and where is monash university?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767775699.7483144, "query": "Summarize the uploaded documents", "retrieved_count": 0, "confidence": 0.0, "answer_known": false}
{"timestamp": 1767775793.0641322, "query": "Summarize the uploaded documents", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767775929.047869, "query": "what is university name?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767776180.9555495, "query": "what are the visa conditions?", "retrieved_count": 5, "confidence": 0.0, "answer_known": false}
{"timestamp": 1767776250.0441537, "query": "tell me about program information?", "retrieved_count": 5, "confidence": 1.0, "answer_known": true}
{"timestamp": 1767777566.4153016, "query": "what was the role ?", "retrieved_count": 3, "confidence": 0.6, "answer_known": true}
rag_store.py
CHANGED
@@ -1,19 +1,14 @@
 import faiss
-import numpy as np
 import os
 import pickle
 from pypdf import PdfReader
-from sentence_transformers import SentenceTransformer
+from sentence_transformers import SentenceTransformer, CrossEncoder

-metadata = []
-
-# Using a lightweight, high-performance embedding model
-embedder = SentenceTransformer("all-MiniLM-L6-v2")
+# =========================================================
+# CONFIG
+# =========================================================
+USE_HNSW = True
+USE_RERANKER = True

 CHUNK_SIZE = 800
 CHUNK_OVERLAP = 200
@@ -21,166 +16,177 @@ CHUNK_OVERLAP = 200
 DB_FILE_INDEX = "vector.index"
 DB_FILE_META = "metadata.pkl"

+# =========================================================
+# GLOBAL STATE
+# =========================================================
+index = None
+documents = []
+metadata = []
+
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+
+# =========================================================
+# HELPERS
+# =========================================================
 def chunk_text(text):
+    import re
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+
+    chunks, current = [], ""
+    for s in sentences:
+        if len(current) + len(s) > CHUNK_SIZE and current:
+            chunks.append(current.strip())
+            overlap = max(0, len(current) - CHUNK_OVERLAP)
+            current = current[overlap:] + " " + s
+        else:
+            current += " " + s if current else s
+
+    if current.strip():
+        chunks.append(current.strip())
     return chunks

+
 def save_db():
     if index:
         faiss.write_index(index, DB_FILE_INDEX)
     if documents:
         with open(DB_FILE_META, "wb") as f:
             pickle.dump({"documents": documents, "metadata": metadata}, f)
+

 def load_db():
     global index, documents, metadata
     if os.path.exists(DB_FILE_INDEX) and os.path.exists(DB_FILE_META):
+        index = faiss.read_index(DB_FILE_INDEX)
+        with open(DB_FILE_META, "rb") as f:
+            data = pickle.load(f)
+        documents = data["documents"]
+        metadata = data["metadata"]
+        print(f"DEBUG: Loaded {len(documents)} chunks")
+

 load_db()

+
 def clear_database():
     global index, documents, metadata
     index = None
     documents = []
     metadata = []
+
     if os.path.exists(DB_FILE_INDEX):
         os.remove(DB_FILE_INDEX)
     if os.path.exists(DB_FILE_META):
         os.remove(DB_FILE_META)

+
+# =========================================================
+# INGEST
+# =========================================================
 def ingest_documents(files):
     global index, documents, metadata

+    texts, meta = [], []

     for file in files:
+        name = file.filename
+
+        if name.endswith(".pdf"):
             reader = PdfReader(file.file)
             for i, page in enumerate(reader.pages):
+                try:
+                    text = page.extract_text()
+                except Exception:
+                    text = None
+
+                if text:
+                    for chunk in chunk_text(text):
                         texts.append(chunk)
+                        meta.append({"source": name, "page": i + 1})
+
+        elif name.endswith(".txt"):
+            content = file.file.read().decode("utf-8", errors="ignore")
             for chunk in chunk_text(content):
                 texts.append(chunk)
+                meta.append({"source": name, "page": "N/A"})

+    if not texts:
         raise ValueError(
+            "No readable text found. "
+            "If this is a scanned PDF, OCR is required."
         )

+    embeddings = embedder.encode(
+        texts,
+        convert_to_numpy=True,
+        normalize_embeddings=True
+    )

     if index is None:
+        dim = embeddings.shape[1]
+        if USE_HNSW:
+            index = faiss.IndexHNSWFlat(dim, 32)
+            index.hnsw.efConstruction = 200
+            index.hnsw.efSearch = 64
+        else:
+            index = faiss.IndexFlatIP(dim)
+
+    index.add(embeddings)
     documents.extend(texts)
     metadata.extend(meta)

     save_db()
     return len(documents)

+
+# =========================================================
+# SEARCH
+# =========================================================
+def search_knowledge(query, top_k=8, min_similarity=0.25):
     if index is None:
         return []

+    qvec = embedder.encode(
+        [query],
+        convert_to_numpy=True,
+        normalize_embeddings=True
+    )
+
+    scores, indices = index.search(qvec, top_k)
+    candidates = []
+    ql = query.lower()

     for idx, score in zip(indices[0], scores[0]):
+        if idx == -1:
+            continue
+
+        text = documents[idx]
+        meta = metadata[idx]
+        keyword_hits = sum(w in text.lower() for w in ql.split())
+        hybrid_score = float(score) + (0.05 * keyword_hits)
+
+        if hybrid_score >= min_similarity:
+            candidates.append({
+                "text": text,
+                "metadata": meta,
+                "hybrid_score": hybrid_score
             })

+    if USE_RERANKER and candidates:
+        pairs = [(query, c["text"]) for c in candidates]
+        scores = reranker.predict(pairs)
+        for c, s in zip(candidates, scores):
+            c["rerank"] = float(s)
+        candidates.sort(key=lambda x: x["rerank"], reverse=True)
+    else:
+        candidates.sort(key=lambda x: x["hybrid_score"], reverse=True)

+    return candidates[:5]


+def get_all_chunks(limit=50):
+    return [
+        {"text": t, "metadata": m}
+        for t, m in zip(documents[:limit], metadata[:limit])
+    ]
render.yaml
DELETED
@@ -1,12 +0,0 @@
-services:
-  - type: web
-    name: gemini-rag-fastapi
-    env: python
-    plan: free
-    buildCommand: |
-      pip install -r requirements.txt
-    startCommand: |
-      uvicorn main:app --host 0.0.0.0 --port 10000
-    envVars:
-      - key: GEMINI_API_KEY
-        sync: false