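"""Retrieval-augmented generation (RAG) support for educational content.

KnowledgeRetriever builds a TF-IDF index over every item stored in the
KnowledgeBase and uses cosine similarity to surface relevant facts,
contextual hints, and cited sources for a given question.
"""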
import ast
import sqlite3
from typing import Any, Dict, List, Optional

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from .knowledge_base import KnowledgeBase

class KnowledgeRetriever:
    """Retrieval-augmented generation system for educational content."""
    
    def __init__(self, knowledge_base: KnowledgeBase):
        self.kb = knowledge_base
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),
            max_features=1000
        )
        self._build_index()
    
    def _build_index(self):
        """Build TF-IDF index for semantic search."""
        # Get all knowledge items
        all_items = []
        with sqlite3.connect(self.kb.db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute("SELECT * FROM knowledge_items")
            for row in cursor.fetchall():
                all_items.append({
                    "id": row["id"],
                    "skill": row["skill"],
                    "content": row["content"],
                    "facts": eval(row["facts"]),
                    "difficulty": row["difficulty"]
                })
        
        self.all_items = all_items
        
        # Build corpus for vectorization
        corpus = []
        for item in self.all_items:
            text = f"{item['skill']} {item['content']} {' '.join(item['facts'])}"
            corpus.append(text)
        
        # Fit the vectorizer over the corpus. This assumes the knowledge base
        # contains at least one item; fit_transform raises ValueError on an
        # empty corpus.
        self.tfidf_matrix = self.vectorizer.fit_transform(corpus)
    
    def retrieve_relevant_knowledge(self, query: str, skill: Optional[str] = None, top_k: int = 3) -> List[Dict[str, Any]]:
        """Retrieve relevant knowledge items for a query."""
        # If skill is specified, prioritize skill-specific items
        if skill:
            skill_items = self.kb.retrieve_by_skill(skill, limit=top_k)
            if len(skill_items) >= top_k:
                return skill_items[:top_k]
        
        # Use semantic search
        query_vec = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
        
        # Get top-k most similar items: argsort is ascending, so take the
        # last k indices and reverse them for descending similarity
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        
        results = []
        for idx in top_indices:
            if similarities[idx] > 0.1:  # Threshold for relevance
                item = self.all_items[idx].copy()
                item["relevance_score"] = float(similarities[idx])
                results.append(item)
        
        return results
    
    def get_facts_for_explanation(self, question: str, user_answer: str, solution: str) -> List[str]:
        """Extract relevant facts for explaining a problem."""
        query = f"{question} {solution}"
        relevant_items = self.retrieve_relevant_knowledge(query, top_k=5)
        
        # Collect and deduplicate facts
        all_facts = []
        seen_facts = set()
        
        for item in relevant_items:
            for fact in item["facts"]:
                if fact not in seen_facts:
                    all_facts.append(fact)
                    seen_facts.add(fact)
        
        return all_facts[:5]  # Cap at five facts, kept in retrieval-relevance order
    
    def get_contextual_hints(self, question: str, hint_level: int = 1) -> List[str]:
        """Generate contextual hints based on retrieved knowledge."""
        relevant_items = self.retrieve_relevant_knowledge(question, top_k=3)
        
        if hint_level == 1:
            # Conceptual nudge
            hints = [item["content"].split('.')[0] + "." for item in relevant_items]
        elif hint_level == 2:
            # Procedural cue
            hints = [item["content"] for item in relevant_items]
        else:
            # Near-solution scaffold
            hints = []
            for item in relevant_items:
                for fact in item["facts"]:
                    if "step" in fact.lower() or "method" in fact.lower():
                        hints.append(fact)
        
        return hints[:3]
    
    def get_explanation_with_citations(self, question: str, user_answer: str, solution: str) -> Dict[str, Any]:
        """Generate explanation with knowledge citations."""
        facts = self.get_facts_for_explanation(question, user_answer, solution)
        relevant_items = self.retrieve_relevant_knowledge(f"{question} {solution}", top_k=3)
        
        return {
            "facts": facts,
            "citations": [{"id": item["id"], "skill": item["skill"]} for item in relevant_items],
            "sources": [item["content"] for item in relevant_items]
        }
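

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal example of wiring the retriever to a knowledge base. The
# KnowledgeBase constructor arguments below are assumptions for illustration;
# this module only relies on `kb.db_path` (read in _build_index) and
# `kb.retrieve_by_skill()`. The question text and the "fractions" skill name
# are hypothetical.
#
#     kb = KnowledgeBase(db_path="knowledge.db")
#     retriever = KnowledgeRetriever(kb)
#
#     items = retriever.retrieve_relevant_knowledge(
#         "How do I add fractions with unlike denominators?",
#         skill="fractions",
#         top_k=3,
#     )
#     hints = retriever.get_contextual_hints("Add 1/2 + 1/3", hint_level=1)
#     explanation = retriever.get_explanation_with_citations(
#         "Add 1/2 + 1/3", user_answer="2/5", solution="5/6"
#     )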