import hashlib import sqlite3 from typing import List, Dict, Any, Tuple from .knowledge_base import KnowledgeBase import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity class KnowledgeRetriever: """Retrieval-augmented generation system for educational content.""" def __init__(self, knowledge_base: KnowledgeBase): self.kb = knowledge_base self.vectorizer = TfidfVectorizer( stop_words='english', ngram_range=(1, 2), max_features=1000 ) self._build_index() def _build_index(self): """Build TF-IDF index for semantic search.""" # Get all knowledge items all_items = [] with sqlite3.connect(self.kb.db_path) as conn: conn.row_factory = sqlite3.Row cursor = conn.execute("SELECT * FROM knowledge_items") for row in cursor.fetchall(): all_items.append({ "id": row["id"], "skill": row["skill"], "content": row["content"], "facts": eval(row["facts"]), "difficulty": row["difficulty"] }) self.all_items = all_items # Build corpus for vectorization corpus = [] for item in self.all_items: text = f"{item['skill']} {item['content']} {' '.join(item['facts'])}" corpus.append(text) # Fit vectorizer self.tfidf_matrix = self.vectorizer.fit_transform(corpus) def retrieve_relevant_knowledge(self, query: str, skill: str = None, top_k: int = 3) -> List[Dict[str, Any]]: """Retrieve relevant knowledge items for a query.""" # If skill is specified, prioritize skill-specific items if skill: skill_items = self.kb.retrieve_by_skill(skill, limit=top_k) if len(skill_items) >= top_k: return skill_items[:top_k] # Use semantic search query_vec = self.vectorizer.transform([query]) similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten() # Get top-k most similar items top_indices = np.argsort(similarities)[-top_k:][::-1] results = [] for idx in top_indices: if similarities[idx] > 0.1: # Threshold for relevance item = self.all_items[idx].copy() item["relevance_score"] = float(similarities[idx]) results.append(item) return results def get_facts_for_explanation(self, question: str, user_answer: str, solution: str) -> List[str]: """Extract relevant facts for explaining a problem.""" query = f"{question} {solution}" relevant_items = self.retrieve_relevant_knowledge(query, top_k=5) # Collect and deduplicate facts all_facts = [] seen_facts = set() for item in relevant_items: for fact in item["facts"]: if fact not in seen_facts: all_facts.append(fact) seen_facts.add(fact) return all_facts[:5] # Return top 5 most relevant facts def get_contextual_hints(self, question: str, hint_level: int = 1) -> List[str]: """Generate contextual hints based on retrieved knowledge.""" relevant_items = self.retrieve_relevant_knowledge(question, top_k=3) if hint_level == 1: # Conceptual nudge hints = [item["content"].split('.')[0] + "." for item in relevant_items] elif hint_level == 2: # Procedural cue hints = [item["content"] for item in relevant_items] else: # Near-solution scaffold hints = [] for item in relevant_items: for fact in item["facts"]: if "step" in fact.lower() or "method" in fact.lower(): hints.append(fact) return hints[:3] def get_explanation_with_citations(self, question: str, user_answer: str, solution: str) -> Dict[str, Any]: """Generate explanation with knowledge citations.""" facts = self.get_facts_for_explanation(question, user_answer, solution) relevant_items = self.retrieve_relevant_knowledge(f"{question} {solution}", top_k=3) return { "facts": facts, "citations": [{"id": item["id"], "skill": item["skill"]} for item in relevant_items], "sources": [item["content"] for item in relevant_items] }