"""RAG Pipeline Tracker - Mock retrieval-augmented generation stages for visualization."""
import hashlib
import time
from typing import List, Dict, Any
import random
class RAGTracker:
    """Tracks and mocks RAG pipeline stages for educational visualization.

    Each ``track_*`` method fabricates plausible-looking data for one stage of
    a retrieval-augmented-generation pipeline, appends it to ``self.stages``,
    and returns it, so a complete run can be replayed via
    :meth:`get_pipeline_summary`. Nothing here performs real retrieval or
    generation; it is all deterministic-ish mock data for teaching.
    """

    def __init__(self):
        # Per-stage payload dicts, in the order the stages were tracked.
        self.stages: List[Dict[str, Any]] = []
        # Wall-clock origin for every stage's relative "timestamp_ms" field.
        self.start_time = time.time()

    def track_query_encoding(self, query: str) -> Dict[str, Any]:
        """Stage 1: Encode query into embedding space (mocked).

        Deterministic per query: the first 8 hex digits of the query's MD5
        seed the module-level RNG, so identical queries always yield identical
        pseudo-embeddings.

        Args:
            query: User's question text.

        Returns:
            The stage payload that was appended to ``self.stages``.
        """
        query_hash = int(hashlib.md5(query.encode()).hexdigest()[:8], 16)
        random.seed(query_hash)
        # Mock a full 768-dim embedding; only the first 10 values are surfaced
        # in the payload (the full vector exists just for verisimilitude).
        embedding = [round(random.uniform(-1.0, 1.0), 3) for _ in range(768)]
        stage_data = {
            "stage": "query_encoding",
            "query": query,
            "embedding_dim": 768,
            "embedding_preview": embedding[:10],
            "encoding_method": "sentence-transformers (mock)",
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2),
        }
        self.stages.append(stage_data)
        return stage_data

    def track_retrieval(self, query: str, mode: str = "standard") -> Dict[str, Any]:
        """Stage 2: Semantic search over a knowledge base (mocked).

        Relevance scores are deterministic per query: the query hash re-seeds
        the global RNG *after* the mock documents are generated, so document
        "citations" counts vary run-to-run while scores do not.

        Args:
            query: User's question text.
            mode: Template family to draw documents from ("technical",
                "code", or anything else for the default set).

        Returns:
            The stage payload, including the scored ``documents`` list
            sorted by descending relevance.
        """
        docs = self._generate_mock_documents(query.lower(), mode)
        query_hash = int(hashlib.md5(query.encode()).hexdigest()[:8], 16)
        random.seed(query_hash)
        for doc in docs:
            # Scores skew high (0.75-0.95) so the mock results read as good matches.
            doc["relevance_score"] = round(random.uniform(0.75, 0.95), 3)
            doc["retrieval_method"] = "Dense Passage Retrieval (DPR)"
        docs.sort(key=lambda d: d["relevance_score"], reverse=True)
        stage_data = {
            "stage": "retrieval",
            "num_documents_searched": random.randint(50000, 500000),
            "top_k_retrieved": len(docs),
            "documents": docs,
            "search_time_ms": round(random.uniform(8, 25), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2),
        }
        self.stages.append(stage_data)
        return stage_data

    def track_reranking(self, documents: List[Dict]) -> Dict[str, Any]:
        """Stage 3: Re-rank retrieved documents with a cross-encoder (mocked).

        Only the top 5 input documents are re-ranked. This method never seeds
        the RNG itself, so its output is deterministic only when a preceding
        stage has seeded the global stream (as ``track_retrieval`` does in the
        normal pipeline order).

        Args:
            documents: Scored documents from :meth:`track_retrieval`.

        Returns:
            The stage payload with ``reranked_documents`` sorted by new score.
        """
        reranked = []
        for doc in documents[:5]:
            old_score = doc.get("relevance_score", 0.8)
            # Asymmetric adjustment range: most docs improve a little, some drop.
            adjustment = random.uniform(-0.08, 0.12)
            # Clamp into a plausible [0.60, 0.99] score band.
            new_score = min(0.99, max(0.60, old_score + adjustment))
            reranked.append({
                "title": doc["title"],
                "snippet": doc["snippet"],
                "old_score": old_score,
                "new_score": round(new_score, 3),
                "score_change": round(adjustment, 3),
                "reranker": "cross-encoder/ms-marco-MiniLM (mock)",
            })
        reranked.sort(key=lambda d: d["new_score"], reverse=True)
        stage_data = {
            "stage": "reranking",
            "reranked_documents": reranked,
            "reranking_time_ms": round(random.uniform(15, 40), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2),
        }
        self.stages.append(stage_data)
        return stage_data

    def track_generation(self, query: str, context_docs: List[Dict], response: str) -> Dict[str, Any]:
        """Stage 4: Generate response with attribution (mocked).

        Builds citation entries for up to three context documents; each is
        randomly flagged as "used" with ~2/3 probability.

        Args:
            query: User's question text (unused in the payload beyond context).
            context_docs: Documents handed to the generator (re-ranked or raw).
            response: The already-generated response text.

        Returns:
            The stage payload, including citation metadata.
        """
        citations = []
        for i, doc in enumerate(context_docs[:3], 1):
            citations.append({
                "id": i,
                "title": doc.get("title", "Source"),
                # Prefer the post-rerank score; fall back to the retrieval score.
                "relevance": doc.get("new_score", doc.get("relevance_score", 0.8)),
                "used": random.choice([True, True, False]),  # biased toward "used"
            })
        stage_data = {
            "stage": "generation",
            "context_length": sum(len(d.get("snippet", "")) for d in context_docs),
            "num_context_docs": len(context_docs),
            "response_length": len(response),
            "citations": citations,
            "generation_time_ms": round(random.uniform(200, 800), 2),
            "timestamp_ms": round((time.time() - self.start_time) * 1000, 2),
        }
        self.stages.append(stage_data)
        return stage_data

    def get_pipeline_summary(self) -> Dict[str, Any]:
        """Get complete pipeline visualization data.

        Returns:
            A dict with all tracked stages, total elapsed time in ms, and a
            static description of the (mock) pipeline components.
        """
        total_time = round((time.time() - self.start_time) * 1000, 2)
        return {
            "stages": self.stages,
            "total_time_ms": total_time,
            "pipeline_type": "RAG (Retrieval-Augmented Generation)",
            "components": [
                "Query Encoder (sentence-transformers)",
                "Vector Database (FAISS/approximate NN)",
                "Re-ranker (cross-encoder)",
                "Generator (LLM with context)",
            ],
        }

    def _generate_mock_documents(self, query: str, mode: str) -> List[Dict[str, str]]:
        """Generate contextually relevant mock documents based on query keywords.

        Args:
            query: Lower-cased user query.
            mode: "technical" or "code" selects specialized templates (padded
                with defaults); any other value uses the default set.

        Returns:
            Up to 5 documents with the extracted topic substituted into the
            title/snippet templates. ``citations`` counts are drawn from the
            global RNG in whatever state the caller left it.
        """
        # Database of mock document templates, keyed by mode family.
        doc_templates = {
            "default": [
                {
                    "title": "Introduction to {topic}",
                    "snippet": "This comprehensive guide covers the fundamentals of {topic}, including key concepts, practical applications, and real-world examples.",
                    "source": "Educational Resources Database",
                    "citations": random.randint(50, 500),
                },
                {
                    "title": "Advanced Concepts in {topic}",
                    "snippet": "Exploring advanced techniques and methodologies in {topic}, with detailed analysis of current research and best practices.",
                    "source": "Academic Journal Repository",
                    "citations": random.randint(100, 1000),
                },
                {
                    "title": "{topic}: A Practical Guide",
                    "snippet": "Step-by-step tutorial demonstrating how to apply {topic} concepts in real-world scenarios, with code examples and case studies.",
                    "source": "Technical Documentation",
                    "citations": random.randint(30, 200),
                },
                {
                    "title": "Understanding {topic} for Beginners",
                    "snippet": "Simplified introduction to {topic} designed for newcomers, breaking down complex ideas into digestible explanations.",
                    "source": "Learning Platform",
                    "citations": random.randint(20, 150),
                },
                {
                    "title": "Recent Advances in {topic}",
                    "snippet": "Survey of the latest developments and breakthrough research in {topic}, covering state-of-the-art techniques and future directions.",
                    "source": "Research Papers Archive",
                    "citations": random.randint(200, 2000),
                },
            ],
            "technical": [
                {
                    "title": "Technical Deep-Dive: {topic}",
                    "snippet": "Detailed technical analysis of {topic} architecture, implementation details, performance characteristics, and optimization strategies.",
                    "source": "Technical Specifications",
                    "citations": random.randint(150, 800),
                },
                {
                    "title": "{topic} Implementation Reference",
                    "snippet": "Complete reference implementation with benchmarks, complexity analysis, and comparison with alternative approaches.",
                    "source": "Engineering Documentation",
                    "citations": random.randint(80, 400),
                },
            ],
            "code": [
                {
                    "title": "{topic} Code Examples",
                    "snippet": "Annotated code samples demonstrating {topic} implementation patterns, with explanations of key design decisions and trade-offs.",
                    "source": "Code Repository",
                    "citations": random.randint(40, 300),
                },
                {
                    "title": "Building with {topic}: Tutorial",
                    "snippet": "Hands-on coding tutorial walking through {topic} implementation from scratch, including testing and debugging strategies.",
                    "source": "Developer Tutorials",
                    "citations": random.randint(60, 350),
                },
            ],
        }
        # Extract topic from query (simple heuristic).
        topic = self._extract_topic(query)
        # Specialized modes lead with their own templates, padded with defaults.
        if mode == "technical":
            templates = doc_templates["technical"] + doc_templates["default"][:3]
        elif mode == "code":
            templates = doc_templates["code"] + doc_templates["default"][:3]
        else:
            templates = doc_templates["default"]
        # Fill templates with the extracted topic; cap at 5 results.
        docs = []
        for template in templates[:5]:
            docs.append({
                "title": template["title"].format(topic=topic.title()),
                "snippet": template["snippet"].format(topic=topic),
                "source": template["source"],
                "citations": template["citations"],
            })
        return docs

    def _extract_topic(self, query: str) -> str:
        """Extract main topic from query (simple keyword extraction).

        Args:
            query: Raw user query.

        Returns:
            Up to 3 content words joined by spaces, a single content word,
            or the fallback "this concept" when nothing survives filtering.
        """
        # Common question words to drop before picking topic words.
        stopwords = {
            "what", "how", "why", "when", "where", "who", "explain",
            "describe", "tell", "me", "about", "the", "is", "are", "can",
            "you", "do", "does",
        }
        # Strip trailing punctuation BEFORE the stopword/length tests; the
        # previous version tested the raw token, so punctuated question words
        # like "How?" slipped through and polluted the topic.
        words = [w.strip("?.,!") for w in query.lower().split()]
        filtered = [w for w in words if w not in stopwords and len(w) > 2]
        # Return a multi-word topic (max 3 words), a single word, or a fallback.
        if len(filtered) >= 2:
            return " ".join(filtered[:3])
        elif len(filtered) == 1:
            return filtered[0]
        else:
            return "this concept"
def create_rag_pipeline(query: str, mode: str, response: str) -> Dict[str, Any]:
    """Create a complete RAG pipeline trace for visualization.

    Runs all four mock stages in order on a fresh tracker and returns the
    aggregated trace.

    Args:
        query: User's question
        mode: Learning mode (affects document selection)
        response: Generated response text

    Returns:
        Complete pipeline data with all stages
    """
    pipeline = RAGTracker()

    # Stage 1: encode the query into the (mock) embedding space.
    pipeline.track_query_encoding(query)

    # Stage 2: retrieve candidate documents for the query.
    retrieved = pipeline.track_retrieval(query, mode)["documents"]

    # Stage 3: re-rank the retrieved candidates.
    reranked = pipeline.track_reranking(retrieved)["reranked_documents"]

    # Stage 4: attribute the generated response to the re-ranked context.
    pipeline.track_generation(query, reranked, response)

    return pipeline.get_pipeline_summary()