Spaces:
Sleeping
Sleeping
"""
Database retrieval tools for GAIA question similarity search.

Connects to a Supabase database to find similar questions and their answers.
Combines the efficiency of LangChain's SupabaseVectorStore with custom
similarity-scoring logic.
"""
| import os | |
| import json | |
| from typing import List, Dict, Optional, Tuple | |
| from supabase import create_client, Client | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import SupabaseVectorStore | |
| from langchain_core.tools import tool | |
class GAIADatabaseRetriever:
    """Handles similarity search against the GAIA Q&A database with dual embedding support.

    Queries are embedded with either HuggingFace or OpenAI embeddings and
    matched against the Supabase ``documents`` table, first through
    LangChain's ``SupabaseVectorStore`` and, when that is unavailable or
    fails, through a manual cosine-similarity scan.

    Stored documents are expected to carry ``page_content`` in the form
    ``"Q: <question> A: <answer>"``; rows that do not contain both markers
    are ignored.
    """

    def __init__(self, use_huggingface: bool = True):
        """Create the Supabase client, the embedding model and the vector store.

        Args:
            use_huggingface: When True, use the
                ``sentence-transformers/all-mpnet-base-v2`` HuggingFace model
                and fall back to OpenAI on ``ImportError``; when False, use
                OpenAI ``text-embedding-3-small`` directly.

        Raises:
            ValueError: If ``SUPABASE_URL`` or both ``SUPABASE_SERVICE_KEY``
                and ``SUPABASE_KEY`` are missing from the environment.
        """
        # Initialize Supabase client
        self.supabase_url = os.getenv("SUPABASE_URL")
        self.supabase_key = os.getenv("SUPABASE_SERVICE_KEY") or os.getenv("SUPABASE_KEY")
        if not self.supabase_url or not self.supabase_key:
            raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY (or SUPABASE_KEY) must be set in environment variables")
        self.supabase: Client = create_client(self.supabase_url, self.supabase_key)

        # Choose embedding model
        if use_huggingface:
            try:
                # Use HuggingFace embeddings (free and often better for similarity)
                self.embeddings = HuggingFaceEmbeddings(
                    model_name="sentence-transformers/all-mpnet-base-v2"
                )
                self.embedding_model = "huggingface"
            except ImportError:
                print("⚠️ HuggingFace embeddings not available, falling back to OpenAI")
                self.embeddings = self._build_openai_embeddings()
                self.embedding_model = "openai"
        else:
            # Use OpenAI embeddings
            self.embeddings = self._build_openai_embeddings()
            self.embedding_model = "openai"

        # Initialize vector store. The attribute is always defined so that
        # later code can reference it even when construction fails.
        self.vector_store = None
        try:
            self.vector_store = SupabaseVectorStore(
                client=self.supabase,
                embedding=self.embeddings,
                table_name="documents",
                query_name="match_documents_langchain",  # Assumes you have this function
            )
            self.use_vector_store = True
        except Exception as e:
            print(f"⚠️ Vector store not available: {e}")
            print("Falling back to manual similarity search")
            self.use_vector_store = False

    @staticmethod
    def _build_openai_embeddings():
        """Build the OpenAI embedding client, reading the key from OPENAI_API_KEY."""
        return OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=os.getenv("OPENAI_API_KEY")
        )

    @staticmethod
    def _split_qa(page_content) -> Optional[Tuple[str, str]]:
        """Split ``"Q: ... A: ..."`` text into ``(question, answer)``, or None if it does not match."""
        if not isinstance(page_content, str):
            return None
        if 'Q:' not in page_content or 'A:' not in page_content:
            return None
        # Split on the FIRST 'A:' only, so an answer that itself contains
        # "A:" is kept whole instead of being truncated.
        question_part, _, answer_part = page_content.partition('A:')
        # Drop only the leading marker; later "Q:" occurrences are question text.
        return question_part.replace('Q:', '', 1).strip(), answer_part.strip()

    @staticmethod
    def _parse_embedding(raw) -> Optional[List[float]]:
        """Convert a stored embedding (JSON string or sequence) to a list of floats, or None."""
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except ValueError:  # json.JSONDecodeError is a ValueError
                return None
        if not isinstance(raw, (list, tuple)) or not raw:
            return None
        try:
            return [float(value) for value in raw]
        except (TypeError, ValueError):
            return None

    def search_similar_questions_efficient(self, question: str, top_k: int = 3) -> List[Dict]:
        """
        Efficient search using LangChain SupabaseVectorStore.

        Delegates to ``search_similar_questions_manual`` when the vector store
        is not available or when the vector search raises.

        Args:
            question: The question text to search for.
            top_k: Maximum number of documents to request.

        Returns:
            A list of dicts with keys ``id``, ``question``, ``answer``,
            ``similarity`` and ``page_content``. Documents whose content is
            not in ``Q: ... A: ...`` form are omitted.
        """
        if top_k <= 0:
            return []
        try:
            if not self.use_vector_store:
                return self.search_similar_questions_manual(question, top_k)

            # Use LangChain's efficient vector search
            docs = self.vector_store.similarity_search(question, k=top_k)

            similar_docs = []
            for doc in docs:
                page_content = doc.page_content
                parsed = self._split_qa(page_content)
                if parsed is None:
                    continue
                question_part, answer_part = parsed
                similar_docs.append({
                    'id': doc.metadata.get('id', 'unknown'),
                    'question': question_part,
                    'answer': answer_part,
                    # 0.8 is a placeholder used when the metadata carries no score.
                    'similarity': doc.metadata.get('similarity', 0.8),  # Estimated
                    'page_content': page_content
                })
            return similar_docs
        except Exception as e:
            print(f"Error in efficient search: {e}")
            return self.search_similar_questions_manual(question, top_k)

    def search_similar_questions_manual(self, question: str, top_k: int = 3, similarity_threshold: float = 0.75) -> List[Dict]:
        """
        Fallback manual search with precise similarity scoring.

        Embeds the question, fetches every row of the ``documents`` table and
        computes cosine similarity in Python.

        Args:
            question: The question text to search for.
            top_k: Maximum number of results to return.
            similarity_threshold: Minimum cosine similarity for a row to be kept.

        Returns:
            Up to ``top_k`` dicts (keys ``id``, ``question``, ``answer``,
            ``similarity``, ``page_content``) sorted by descending similarity.
            Returns an empty list on any error.
        """
        if top_k <= 0:
            return []
        try:
            # Get embedding for the input question
            query_embedding = self.embeddings.embed_query(question)
            # The query norm is the same for every row, so compute it once.
            query_norm = sum(a * a for a in query_embedding) ** 0.5
            if query_norm == 0:
                return []

            # Fetch all documents from Supabase
            response = self.supabase.table("documents").select("*").execute()
            if not response.data:
                return []

            # Calculate similarities manually
            similar_docs = []
            for doc in response.data:
                # Stored embeddings are presumably JSON-encoded strings (the
                # usual pgvector wire format); plain lists are accepted too.
                stored_embedding = self._parse_embedding(doc.get('embedding'))
                if stored_embedding is None:
                    continue
                # Vectors from a different embedding model have a different
                # dimension; zip() would silently truncate and give a bogus score.
                if len(stored_embedding) != len(query_embedding):
                    continue

                stored_norm = sum(b * b for b in stored_embedding) ** 0.5
                if stored_norm == 0:
                    continue
                dot_product = sum(a * b for a, b in zip(query_embedding, stored_embedding))
                similarity = dot_product / (query_norm * stored_norm)
                if similarity < similarity_threshold:
                    continue

                # Extract question and answer from page_content
                page_content = doc.get('page_content')
                parsed = self._split_qa(page_content)
                if parsed is None:
                    continue
                question_part, answer_part = parsed
                similar_docs.append({
                    'id': doc.get('id', 'unknown'),
                    'question': question_part,
                    'answer': answer_part,
                    'similarity': float(similarity),
                    'page_content': page_content
                })

            # Sort by similarity
            similar_docs.sort(key=lambda x: x['similarity'], reverse=True)
            return similar_docs[:top_k]
        except Exception as e:
            print(f"Error in manual search: {e}")
            return []
# Module-level singleton. It is built on first use rather than at import time
# so that importing this module does not fail when env vars are missing.
retriever = None


def get_retriever():
    """Return the shared GAIADatabaseRetriever, creating it on the first call."""
    global retriever
    if retriever is not None:
        return retriever
    retriever = GAIADatabaseRetriever(use_huggingface=True)
    return retriever
def create_retriever_from_supabase(query: str) -> str:
    """
    Look up documents similar to ``query`` through the retriever's efficient search path.

    Requests the top 3 matches from ``search_similar_questions_efficient`` and
    renders them as a numbered question/answer listing.

    Args:
        query (str): The search query to find similar documents.

    Returns:
        str: The formatted listing, a "no results" message when nothing
        matched, or an error message if the lookup raised.
    """
    try:
        matches = get_retriever().search_similar_questions_efficient(query, top_k=3)
        if not matches:
            return "No similar questions found in the database."

        divider = "-" * 50 + "\n"
        pieces = [f"Found {len(matches)} similar questions:\n\n"]
        for position, match in enumerate(matches, 1):
            pieces.append(f"Similar Question {position}:\n")
            pieces.append(f"Q: {match['question']}\n")
            pieces.append(f"A: {match['answer']}\n")
            pieces.append(divider)
        return "".join(pieces)
    except Exception as exc:
        return f"Error searching database: {str(exc)}"
def search_similar_gaia_questions(question: str, max_results: int = 3) -> str:
    """
    Find similar GAIA questions using the manual, exactly-scored search path.

    Only matches with a cosine similarity of at least 0.75 are reported, and
    each entry in the listing shows its similarity score.

    Args:
        question: The question to search for
        max_results: Maximum number of similar questions to return (default: 3)

    Returns:
        Formatted string with similar questions and their answers, a
        "no results" message when nothing matched, or an error message if
        the lookup raised.
    """
    try:
        matches = get_retriever().search_similar_questions_manual(
            question,
            top_k=max_results,
            similarity_threshold=0.75,
        )
        if not matches:
            return "No similar questions found in the database."

        divider = "-" * 50 + "\n"
        pieces = [f"Found {len(matches)} similar questions:\n\n"]
        for position, match in enumerate(matches, 1):
            pieces.append(
                f"Similar Question {position} (Similarity: {match['similarity']:.3f}):\n"
            )
            pieces.append(f"Q: {match['question']}\n")
            pieces.append(f"A: {match['answer']}\n")
            pieces.append(divider)
        return "".join(pieces)
    except Exception as exc:
        return f"Error searching database: {str(exc)}"
def get_exact_answer_if_highly_similar(question: str, similarity_threshold: float = 0.95) -> str:
    """
    Return the stored answer when the database holds a near-identical question.

    Runs the manual search for a single best match at or above
    ``similarity_threshold``.

    Args:
        question: The question to search for
        similarity_threshold: High threshold for considering an exact match (default: 0.95)

    Returns:
        ``"EXACT_MATCH_FOUND: <answer>"`` when a match exists,
        ``"NO_EXACT_MATCH: ..."`` when none does, or an error message if the
        lookup raised.
    """
    try:
        hits = get_retriever().search_similar_questions_manual(
            question,
            top_k=1,
            similarity_threshold=similarity_threshold,
        )
        if not hits:
            return "NO_EXACT_MATCH: Proceed with normal agent processing"
        top_hit = hits[0]
        return f"EXACT_MATCH_FOUND: {top_hit['answer']}"
    except Exception as exc:
        return f"Error checking for exact match: {str(exc)}"
# Functions exported for use in agents; both retrieval approaches are included.
DATABASE_TOOLS = [
    # Efficient LangChain approach
    create_retriever_from_supabase,
    # Precise similarity scoring
    search_similar_gaia_questions,
    # Exact match detection
    get_exact_answer_if_highly_similar,
]