File size: 11,813 Bytes
7005995
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
"""RAG Pipeline Tracker - Mock retrieval-augmented generation stages for visualization."""
import hashlib
import time
from typing import List, Dict, Any
import random


class RAGTracker:
    """Tracks and mocks RAG pipeline stages for educational visualization.

    Each ``track_*`` method appends a stage record to ``self.stages`` and also
    returns it, so callers can consume stages incrementally or pull the whole
    trace at the end via :meth:`get_pipeline_summary`.

    Deterministic mock values (the embedding preview, relevance scores, search
    statistics) are derived from a private ``random.Random`` seeded by an MD5
    hash of the query, so the same query always reproduces the same output
    without reseeding or perturbing the process-global ``random`` state.
    Non-deterministic flavor values (rerank adjustments, generation timings,
    citation counts) still draw from the module-level RNG.
    """

    def __init__(self):
        # Ordered trace of stage dicts produced by the track_* methods.
        self.stages: List[Dict[str, Any]] = []
        # Reference point for each stage's relative "timestamp_ms".
        self.start_time = time.time()

    @staticmethod
    def _seeded_rng(query: str) -> random.Random:
        """Return an isolated RNG deterministically seeded from *query*.

        Uses a dedicated ``random.Random`` instance instead of calling
        ``random.seed`` on the global RNG, so tracking a query never disturbs
        other users of the ``random`` module. ``Random(seed)`` yields the same
        sequence as ``random.seed(seed)`` followed by module-level calls, so
        previously-deterministic outputs are unchanged. MD5 is used here only
        as a stable, fast hash — not for security.
        """
        seed = int(hashlib.md5(query.encode()).hexdigest()[:8], 16)
        return random.Random(seed)

    def track_query_encoding(self, query: str) -> Dict[str, Any]:
        """Stage 1: Encode query into embedding space.

        Args:
            query: Raw user question.

        Returns:
            Stage dict with a 10-value preview of a mock 768-dim embedding.
        """
        rng = self._seeded_rng(query)

        # Mock 768-dim embedding; only the first 10 values are surfaced, but
        # all 768 are drawn so the sequence matches a real encoder's shape.
        embedding = [round(rng.uniform(-1.0, 1.0), 3) for _ in range(768)]
        embedding_preview = embedding[:10]

        stage_data = {
            "stage": "query_encoding",
            "query": query,
            "embedding_dim": 768,
            "embedding_preview": embedding_preview,
            "encoding_method": "sentence-transformers (mock)",
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2)
        }
        self.stages.append(stage_data)
        return stage_data

    def track_retrieval(self, query: str, mode: str = "standard") -> Dict[str, Any]:
        """Stage 2: Semantic search over knowledge base.

        Args:
            query: Raw user question.
            mode: Learning mode; "technical" and "code" bias the mock
                document templates, anything else uses the defaults.

        Returns:
            Stage dict containing the mock documents sorted by relevance.
        """
        query_lower = query.lower()

        # Generate contextually relevant mock documents (uses the global RNG
        # for citation counts — intentionally left non-deterministic).
        docs = self._generate_mock_documents(query_lower, mode)

        # Deterministic per-query scores so repeated queries are reproducible.
        rng = self._seeded_rng(query)
        for doc in docs:
            # Scores are drawn high (0.75-0.95) to look like good matches.
            base_score = rng.uniform(0.75, 0.95)
            doc["relevance_score"] = round(base_score, 3)
            doc["retrieval_method"] = "Dense Passage Retrieval (DPR)"

        # Sort by relevance, best match first.
        docs.sort(key=lambda x: x["relevance_score"], reverse=True)

        stage_data = {
            "stage": "retrieval",
            "num_documents_searched": rng.randint(50000, 500000),
            "top_k_retrieved": len(docs),
            "documents": docs,
            "search_time_ms": round(rng.uniform(8, 25), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2)
        }
        self.stages.append(stage_data)
        return stage_data

    def track_reranking(self, documents: List[Dict]) -> Dict[str, Any]:
        """Stage 3: Re-rank retrieved documents with cross-encoder.

        Args:
            documents: Stage-2 documents; each needs "title" and "snippet"
                keys and optionally "relevance_score".

        Returns:
            Stage dict with the top 5 documents re-scored and re-sorted.
        """
        # Simulate re-ranking: small random score adjustments (unseeded, so
        # each run differs — this stage only mimics cross-encoder jitter).
        reranked = []
        for i, doc in enumerate(documents[:5]):  # Only rerank top 5
            old_score = doc.get("relevance_score", 0.8)
            # Some docs improve, some decrease; clamp to a plausible band.
            adjustment = random.uniform(-0.08, 0.12)
            new_score = min(0.99, max(0.60, old_score + adjustment))

            reranked.append({
                "title": doc["title"],
                "snippet": doc["snippet"],
                "old_score": old_score,
                "new_score": round(new_score, 3),
                "score_change": round(adjustment, 3),
                "reranker": "cross-encoder/ms-marco-MiniLM (mock)"
            })

        # Sort by new score, best first.
        reranked.sort(key=lambda x: x["new_score"], reverse=True)

        stage_data = {
            "stage": "reranking",
            "reranked_documents": reranked,
            "reranking_time_ms": round(random.uniform(15, 40), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2)
        }
        self.stages.append(stage_data)
        return stage_data

    def track_generation(self, query: str, context_docs: List[Dict], response: str) -> Dict[str, Any]:
        """Stage 4: Generate response with attribution.

        Args:
            query: Raw user question (recorded for symmetry; not used here).
            context_docs: Re-ranked documents supplying mock citations.
            response: The generated response text (only its length is used).

        Returns:
            Stage dict with mock citations for the top 3 context documents.
        """
        # Extract potential citations from top docs.
        citations = []
        for i, doc in enumerate(context_docs[:3], 1):
            citations.append({
                "id": i,
                "title": doc.get("title", "Source"),
                # Prefer the reranked score when present.
                "relevance": doc.get("new_score", doc.get("relevance_score", 0.8)),
                "used": random.choice([True, True, False])  # Most are used
            })

        stage_data = {
            "stage": "generation",
            "context_length": sum(len(d.get("snippet", "")) for d in context_docs),
            "num_context_docs": len(context_docs),
            "response_length": len(response),
            "citations": citations,
            "generation_time_ms": round(random.uniform(200, 800), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2)
        }
        self.stages.append(stage_data)
        return stage_data

    def get_pipeline_summary(self) -> Dict[str, Any]:
        """Get complete pipeline visualization data.

        Returns:
            All recorded stages plus total elapsed time and a static
            description of the mocked pipeline components.
        """
        total_time = round((time.time() - self.start_time) * 1000, 2)
        return {
            "stages": self.stages,
            "total_time_ms": total_time,
            "pipeline_type": "RAG (Retrieval-Augmented Generation)",
            "components": [
                "Query Encoder (sentence-transformers)",
                "Vector Database (FAISS/approximate NN)",
                "Re-ranker (cross-encoder)",
                "Generator (LLM with context)"
            ]
        }

    def _generate_mock_documents(self, query: str, mode: str) -> List[Dict[str, str]]:
        """Generate contextually relevant mock documents based on query keywords.

        Args:
            query: Lower-cased user question.
            mode: "technical" / "code" prepend mode-specific templates;
                any other value uses the default templates.

        Returns:
            Up to 5 documents with title/snippet/source/citations keys.
        """
        # Database of mock document templates. Citation counts are drawn
        # fresh per call from the global RNG (deliberately varied).
        doc_templates = {
            "default": [
                {
                    "title": "Introduction to {topic}",
                    "snippet": "This comprehensive guide covers the fundamentals of {topic}, including key concepts, practical applications, and real-world examples.",
                    "source": "Educational Resources Database",
                    "citations": random.randint(50, 500)
                },
                {
                    "title": "Advanced Concepts in {topic}",
                    "snippet": "Exploring advanced techniques and methodologies in {topic}, with detailed analysis of current research and best practices.",
                    "source": "Academic Journal Repository",
                    "citations": random.randint(100, 1000)
                },
                {
                    "title": "{topic}: A Practical Guide",
                    "snippet": "Step-by-step tutorial demonstrating how to apply {topic} concepts in real-world scenarios, with code examples and case studies.",
                    "source": "Technical Documentation",
                    "citations": random.randint(30, 200)
                },
                {
                    "title": "Understanding {topic} for Beginners",
                    "snippet": "Simplified introduction to {topic} designed for newcomers, breaking down complex ideas into digestible explanations.",
                    "source": "Learning Platform",
                    "citations": random.randint(20, 150)
                },
                {
                    "title": "Recent Advances in {topic}",
                    "snippet": "Survey of the latest developments and breakthrough research in {topic}, covering state-of-the-art techniques and future directions.",
                    "source": "Research Papers Archive",
                    "citations": random.randint(200, 2000)
                }
            ],
            "technical": [
                {
                    "title": "Technical Deep-Dive: {topic}",
                    "snippet": "Detailed technical analysis of {topic} architecture, implementation details, performance characteristics, and optimization strategies.",
                    "source": "Technical Specifications",
                    "citations": random.randint(150, 800)
                },
                {
                    "title": "{topic} Implementation Reference",
                    "snippet": "Complete reference implementation with benchmarks, complexity analysis, and comparison with alternative approaches.",
                    "source": "Engineering Documentation",
                    "citations": random.randint(80, 400)
                }
            ],
            "code": [
                {
                    "title": "{topic} Code Examples",
                    "snippet": "Annotated code samples demonstrating {topic} implementation patterns, with explanations of key design decisions and trade-offs.",
                    "source": "Code Repository",
                    "citations": random.randint(40, 300)
                },
                {
                    "title": "Building with {topic}: Tutorial",
                    "snippet": "Hands-on coding tutorial walking through {topic} implementation from scratch, including testing and debugging strategies.",
                    "source": "Developer Tutorials",
                    "citations": random.randint(60, 350)
                }
            ]
        }

        # Extract topic from query (simple heuristic).
        topic = self._extract_topic(query)

        # Select templates based on mode; mode-specific templates come first.
        if mode == "technical":
            templates = doc_templates["technical"] + doc_templates["default"][:3]
        elif mode == "code":
            templates = doc_templates["code"] + doc_templates["default"][:3]
        else:
            templates = doc_templates["default"]

        # Fill templates with the extracted topic (title-cased for titles).
        docs = []
        for template in templates[:5]:  # Top 5 results
            docs.append({
                "title": template["title"].format(topic=topic.title()),
                "snippet": template["snippet"].format(topic=topic),
                "source": template["source"],
                "citations": template["citations"]
            })

        return docs

    def _extract_topic(self, query: str) -> str:
        """Extract main topic from query (simple keyword extraction).

        Drops common question/stop words and words of 2 characters or fewer,
        then joins up to the first 3 surviving words.

        Returns:
            A 1-3 word topic string, or the literal "this concept" when no
            content words survive filtering.
        """
        stopwords = ["what", "how", "why", "when", "where", "who", "explain", "describe", "tell", "me", "about", "the", "is", "are", "can", "you", "do", "does"]
        words = query.lower().split()
        # NOTE: the length check runs before punctuation is stripped, so e.g.
        # "ai?" passes the >2 filter — preserved from the original behavior.
        filtered = [w.strip("?.,!") for w in words if w not in stopwords and len(w) > 2]

        # Return multi-word topic or single word.
        if len(filtered) >= 2:
            return " ".join(filtered[:3])  # Max 3 words
        elif len(filtered) == 1:
            return filtered[0]
        else:
            return "this concept"


def create_rag_pipeline(query: str, mode: str, response: str) -> Dict[str, Any]:
    """Create a complete RAG pipeline trace for visualization.
    
    Args:
        query: User's question
        mode: Learning mode (affects document selection)
        response: Generated response text
    
    Returns:
        Complete pipeline data with all stages
    """
    pipeline = RAGTracker()

    # Run the four mock stages in order. Every stage is recorded on the
    # tracker itself, so only the intermediate document lists need to be
    # threaded through explicitly between stages.
    pipeline.track_query_encoding(query)
    retrieved = pipeline.track_retrieval(query, mode)["documents"]
    reranked = pipeline.track_reranking(retrieved)["reranked_documents"]
    pipeline.track_generation(query, reranked, response)

    return pipeline.get_pipeline_summary()