"""RAG Pipeline Tracker - Mock retrieval-augmented generation stages for visualization."""
import hashlib
import time
from typing import List, Dict, Any
import random
class RAGTracker:
    """Tracks and mocks RAG pipeline stages for educational visualization.

    Each ``track_*`` method fabricates plausible-looking data for one stage of
    a retrieval-augmented-generation pipeline, appends it to ``self.stages``,
    and returns it, so a complete run can be replayed via
    :meth:`get_pipeline_summary`. Nothing here performs real retrieval or
    generation; it is all deterministic-ish mock data for teaching.
    """

    def __init__(self):
        # Per-stage payload dicts, in the order the stages were tracked.
        self.stages: List[Dict[str, Any]] = []
        # Wall-clock origin for every stage's relative "timestamp_ms" field.
        self.start_time = time.time()

    def track_query_encoding(self, query: str) -> Dict[str, Any]:
        """Stage 1: Encode query into embedding space (mocked).

        Deterministic per query: the first 8 hex digits of the query's MD5
        seed the module-level RNG, so identical queries always yield identical
        pseudo-embeddings.

        Args:
            query: User's question text.

        Returns:
            The stage payload that was appended to ``self.stages``.
        """
        query_hash = int(hashlib.md5(query.encode()).hexdigest()[:8], 16)
        random.seed(query_hash)
        # Mock a full 768-dim embedding; only the first 10 values are surfaced
        # in the payload (the full vector exists just for verisimilitude).
        embedding = [round(random.uniform(-1.0, 1.0), 3) for _ in range(768)]
        stage_data = {
            "stage": "query_encoding",
            "query": query,
            "embedding_dim": 768,
            "embedding_preview": embedding[:10],
            "encoding_method": "sentence-transformers (mock)",
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2),
        }
        self.stages.append(stage_data)
        return stage_data

    def track_retrieval(self, query: str, mode: str = "standard") -> Dict[str, Any]:
        """Stage 2: Semantic search over a knowledge base (mocked).

        Relevance scores are deterministic per query: the query hash re-seeds
        the global RNG *after* the mock documents are generated, so document
        "citations" counts vary run-to-run while scores do not.

        Args:
            query: User's question text.
            mode: Template family to draw documents from ("technical",
                "code", or anything else for the default set).

        Returns:
            The stage payload, including the scored ``documents`` list
            sorted by descending relevance.
        """
        docs = self._generate_mock_documents(query.lower(), mode)
        query_hash = int(hashlib.md5(query.encode()).hexdigest()[:8], 16)
        random.seed(query_hash)
        for doc in docs:
            # Scores skew high (0.75-0.95) so the mock results read as good matches.
            doc["relevance_score"] = round(random.uniform(0.75, 0.95), 3)
            doc["retrieval_method"] = "Dense Passage Retrieval (DPR)"
        docs.sort(key=lambda d: d["relevance_score"], reverse=True)
        stage_data = {
            "stage": "retrieval",
            "num_documents_searched": random.randint(50000, 500000),
            "top_k_retrieved": len(docs),
            "documents": docs,
            "search_time_ms": round(random.uniform(8, 25), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2),
        }
        self.stages.append(stage_data)
        return stage_data

    def track_reranking(self, documents: List[Dict]) -> Dict[str, Any]:
        """Stage 3: Re-rank retrieved documents with a cross-encoder (mocked).

        Only the top 5 input documents are re-ranked. This method never seeds
        the RNG itself, so its output is deterministic only when a preceding
        stage has seeded the global stream (as ``track_retrieval`` does in the
        normal pipeline order).

        Args:
            documents: Scored documents from :meth:`track_retrieval`.

        Returns:
            The stage payload with ``reranked_documents`` sorted by new score.
        """
        reranked = []
        for doc in documents[:5]:
            old_score = doc.get("relevance_score", 0.8)
            # Asymmetric adjustment range: most docs improve a little, some drop.
            adjustment = random.uniform(-0.08, 0.12)
            # Clamp into a plausible [0.60, 0.99] score band.
            new_score = min(0.99, max(0.60, old_score + adjustment))
            reranked.append({
                "title": doc["title"],
                "snippet": doc["snippet"],
                "old_score": old_score,
                "new_score": round(new_score, 3),
                "score_change": round(adjustment, 3),
                "reranker": "cross-encoder/ms-marco-MiniLM (mock)",
            })
        reranked.sort(key=lambda d: d["new_score"], reverse=True)
        stage_data = {
            "stage": "reranking",
            "reranked_documents": reranked,
            "reranking_time_ms": round(random.uniform(15, 40), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2),
        }
        self.stages.append(stage_data)
        return stage_data

    def track_generation(self, query: str, context_docs: List[Dict], response: str) -> Dict[str, Any]:
        """Stage 4: Generate response with attribution (mocked).

        Builds citation entries for up to three context documents; each is
        randomly flagged as "used" with ~2/3 probability.

        Args:
            query: User's question text (unused in the payload beyond context).
            context_docs: Documents handed to the generator (re-ranked or raw).
            response: The already-generated response text.

        Returns:
            The stage payload, including citation metadata.
        """
        citations = []
        for i, doc in enumerate(context_docs[:3], 1):
            citations.append({
                "id": i,
                "title": doc.get("title", "Source"),
                # Prefer the post-rerank score; fall back to the retrieval score.
                "relevance": doc.get("new_score", doc.get("relevance_score", 0.8)),
                "used": random.choice([True, True, False]),  # biased toward "used"
            })
        stage_data = {
            "stage": "generation",
            "context_length": sum(len(d.get("snippet", "")) for d in context_docs),
            "num_context_docs": len(context_docs),
            "response_length": len(response),
            "citations": citations,
            "generation_time_ms": round(random.uniform(200, 800), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2),
        }
        self.stages.append(stage_data)
        return stage_data

    def get_pipeline_summary(self) -> Dict[str, Any]:
        """Get complete pipeline visualization data.

        Returns:
            A dict with all tracked stages, total elapsed time in ms, and a
            static description of the (mock) pipeline components.
        """
        total_time = round((time.time() - self.start_time) * 1000, 2)
        return {
            "stages": self.stages,
            "total_time_ms": total_time,
            "pipeline_type": "RAG (Retrieval-Augmented Generation)",
            "components": [
                "Query Encoder (sentence-transformers)",
                "Vector Database (FAISS/approximate NN)",
                "Re-ranker (cross-encoder)",
                "Generator (LLM with context)",
            ],
        }

    def _generate_mock_documents(self, query: str, mode: str) -> List[Dict[str, str]]:
        """Generate contextually relevant mock documents based on query keywords.

        Args:
            query: Lower-cased user query.
            mode: "technical" or "code" selects specialized templates (padded
                with defaults); any other value uses the default set.

        Returns:
            Up to 5 documents with the extracted topic substituted into the
            title/snippet templates. ``citations`` counts are drawn from the
            global RNG in whatever state the caller left it.
        """
        # Database of mock document templates, keyed by mode family.
        doc_templates = {
            "default": [
                {
                    "title": "Introduction to {topic}",
                    "snippet": "This comprehensive guide covers the fundamentals of {topic}, including key concepts, practical applications, and real-world examples.",
                    "source": "Educational Resources Database",
                    "citations": random.randint(50, 500),
                },
                {
                    "title": "Advanced Concepts in {topic}",
                    "snippet": "Exploring advanced techniques and methodologies in {topic}, with detailed analysis of current research and best practices.",
                    "source": "Academic Journal Repository",
                    "citations": random.randint(100, 1000),
                },
                {
                    "title": "{topic}: A Practical Guide",
                    "snippet": "Step-by-step tutorial demonstrating how to apply {topic} concepts in real-world scenarios, with code examples and case studies.",
                    "source": "Technical Documentation",
                    "citations": random.randint(30, 200),
                },
                {
                    "title": "Understanding {topic} for Beginners",
                    "snippet": "Simplified introduction to {topic} designed for newcomers, breaking down complex ideas into digestible explanations.",
                    "source": "Learning Platform",
                    "citations": random.randint(20, 150),
                },
                {
                    "title": "Recent Advances in {topic}",
                    "snippet": "Survey of the latest developments and breakthrough research in {topic}, covering state-of-the-art techniques and future directions.",
                    "source": "Research Papers Archive",
                    "citations": random.randint(200, 2000),
                },
            ],
            "technical": [
                {
                    "title": "Technical Deep-Dive: {topic}",
                    "snippet": "Detailed technical analysis of {topic} architecture, implementation details, performance characteristics, and optimization strategies.",
                    "source": "Technical Specifications",
                    "citations": random.randint(150, 800),
                },
                {
                    "title": "{topic} Implementation Reference",
                    "snippet": "Complete reference implementation with benchmarks, complexity analysis, and comparison with alternative approaches.",
                    "source": "Engineering Documentation",
                    "citations": random.randint(80, 400),
                },
            ],
            "code": [
                {
                    "title": "{topic} Code Examples",
                    "snippet": "Annotated code samples demonstrating {topic} implementation patterns, with explanations of key design decisions and trade-offs.",
                    "source": "Code Repository",
                    "citations": random.randint(40, 300),
                },
                {
                    "title": "Building with {topic}: Tutorial",
                    "snippet": "Hands-on coding tutorial walking through {topic} implementation from scratch, including testing and debugging strategies.",
                    "source": "Developer Tutorials",
                    "citations": random.randint(60, 350),
                },
            ],
        }
        # Extract topic from query (simple heuristic).
        topic = self._extract_topic(query)
        # Specialized modes lead with their own templates, padded with defaults.
        if mode == "technical":
            templates = doc_templates["technical"] + doc_templates["default"][:3]
        elif mode == "code":
            templates = doc_templates["code"] + doc_templates["default"][:3]
        else:
            templates = doc_templates["default"]
        # Fill templates with the extracted topic; cap at 5 results.
        docs = []
        for template in templates[:5]:
            docs.append({
                "title": template["title"].format(topic=topic.title()),
                "snippet": template["snippet"].format(topic=topic),
                "source": template["source"],
                "citations": template["citations"],
            })
        return docs

    def _extract_topic(self, query: str) -> str:
        """Extract main topic from query (simple keyword extraction).

        Args:
            query: Raw user query.

        Returns:
            Up to 3 content words joined by spaces, a single content word,
            or the fallback "this concept" when nothing survives filtering.
        """
        # Common question words to drop before picking topic words.
        stopwords = {
            "what", "how", "why", "when", "where", "who", "explain",
            "describe", "tell", "me", "about", "the", "is", "are", "can",
            "you", "do", "does",
        }
        # Strip trailing punctuation BEFORE the stopword/length tests; the
        # previous version tested the raw token, so punctuated question words
        # like "How?" slipped through and polluted the topic.
        words = [w.strip("?.,!") for w in query.lower().split()]
        filtered = [w for w in words if w not in stopwords and len(w) > 2]
        # Return a multi-word topic (max 3 words), a single word, or a fallback.
        if len(filtered) >= 2:
            return " ".join(filtered[:3])
        elif len(filtered) == 1:
            return filtered[0]
        else:
            return "this concept"
def create_rag_pipeline(query: str, mode: str, response: str) -> Dict[str, Any]:
    """Create a complete RAG pipeline trace for visualization.

    Runs all four mock stages in order on a fresh tracker and returns the
    aggregated trace.

    Args:
        query: User's question
        mode: Learning mode (affects document selection)
        response: Generated response text

    Returns:
        Complete pipeline data with all stages
    """
    pipeline = RAGTracker()

    # Stage 1: encode the query into the (mock) embedding space.
    pipeline.track_query_encoding(query)

    # Stage 2: retrieve candidate documents for the query.
    retrieved = pipeline.track_retrieval(query, mode)["documents"]

    # Stage 3: re-rank the retrieved candidates.
    reranked = pipeline.track_reranking(retrieved)["reranked_documents"]

    # Stage 4: attribute the generated response to the re-ranked context.
    pipeline.track_generation(query, reranked, response)

    return pipeline.get_pipeline_summary()