| """Supabase PGVector connection and retrieval functionality for graphics/design documents""" | |
| import os | |
| from typing import List, Dict, Any, Optional | |
| from supabase import create_client, Client | |
| from huggingface_hub import InferenceClient | |
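
# NOTE: This module assumes external resources that are not defined in this file:
# a Supabase Postgres project with the pgvector extension, a `document_embeddings`
# table whose rows include chunk_text, source_type, source_id, title, content_type,
# chunk_index, page_number, word_count, and metadata columns, and a `match_documents`
# RPC function taking (query_embedding, match_threshold, match_count) and returning
# matching rows with a `similarity` score. Credentials are read from the SUPABASE_URL,
# SUPABASE_KEY, JINA_API_KEY, and HF_TOKEN environment variables.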


class Document:
    """Simple document class to match LangChain interface"""

    def __init__(self, page_content: str, metadata: dict):
        self.page_content = page_content
        self.metadata = metadata


class GraphicsVectorStore:
    """Manages connection to Supabase PGVector database with graphics/design document embeddings"""

    def __init__(
        self,
        supabase_url: Optional[str] = None,
        supabase_key: Optional[str] = None,
        hf_token: Optional[str] = None,
        jina_api_key: Optional[str] = None,
        embedding_model: str = "jina-clip-v2"
    ):
| """ | |
| Initialize the vector store connection | |
| Args: | |
| supabase_url: Supabase project URL (defaults to SUPABASE_URL env var) | |
| supabase_key: Supabase anon key (defaults to SUPABASE_KEY env var) | |
| hf_token: HuggingFace API token (defaults to HF_TOKEN env var) | |
| jina_api_key: Jina AI API key (defaults to JINA_API_KEY env var, required for Jina models) | |
| embedding_model: Embedding model to use (default: jinaai/jina-clip-v2) | |
| """ | |
        # Get credentials from parameters or environment
        self.supabase_url = supabase_url or os.getenv("SUPABASE_URL")
        self.supabase_key = supabase_key or os.getenv("SUPABASE_KEY")
        self.hf_token = hf_token or os.getenv("HF_TOKEN")
        self.jina_api_key = jina_api_key or os.getenv("JINA_API_KEY")

        if not self.supabase_url or not self.supabase_key:
            raise ValueError("SUPABASE_URL and SUPABASE_KEY environment variables must be set")

        # Check for the appropriate API key based on the selected model
        self.embedding_model = embedding_model
        if "jina" in self.embedding_model.lower():
            if not self.jina_api_key:
                raise ValueError("JINA_API_KEY environment variable must be set for Jina models")
        else:
            if not self.hf_token:
                raise ValueError("HF_TOKEN environment variable must be set for HuggingFace models")

        # Initialize Supabase client
        self.supabase: Client = create_client(self.supabase_url, self.supabase_key)

        # Initialize HuggingFace Inference client when an HF token is available
        # (note: _generate_embedding below calls the Jina API, not this client)
        if self.hf_token:
            self.hf_client = InferenceClient(token=self.hf_token)

    def _generate_embedding(self, text: str) -> List[float]:
        """
        Generate an embedding for text via the Jina AI Embeddings API

        Args:
            text: Text to embed

        Returns:
            List of floats representing the embedding vector (1024 dimensions for jina-clip-v2)
        """
        try:
            import requests
            import numpy as np

            # Jina AI exposes its own embeddings endpoint
            api_url = "https://api.jina.ai/v1/embeddings"
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.jina_api_key}"
            }
            payload = {
                "model": self.embedding_model,
                "input": [text]
            }
            response = requests.post(api_url, headers=headers, json=payload, timeout=30)
            if response.status_code != 200:
                raise Exception(f"API returned status {response.status_code}: {response.text}")

            result = response.json()

            # Jina API returns embeddings in a 'data' array
            if isinstance(result, dict) and 'data' in result:
                return result['data'][0]['embedding']

            # Fallback to standard response parsing for other shapes
            result = result if not isinstance(result, dict) else result.get('embeddings', result)

            # If it's a numpy array, convert to a list
            if isinstance(result, np.ndarray):
                if result.ndim > 1:
                    result = result[0]  # Take first row if 2D
                return result.tolist()

            # If it's a (possibly nested) list, return a single flat embedding
            if isinstance(result, list) and len(result) > 0:
                if isinstance(result[0], list):
                    return result[0]  # Take first embedding if batched
                if isinstance(result[0], np.ndarray):
                    return result[0].tolist()
                return result

            return result
        except Exception as e:
            raise Exception(f"Error generating embedding with {self.embedding_model}: {str(e)}")
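
    # Illustrative request/response shape for the Jina embeddings call above, inferred
    # from the parsing logic in _generate_embedding (not verified here):
    #   request body:  {"model": "jina-clip-v2", "input": ["query text"]}
    #   response body: {"data": [{"embedding": [0.012, -0.034, ...]}], ...}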

    def similarity_search(
        self,
        query: str,
        k: int = 5,
        match_threshold: float = 0.3
    ) -> List[Document]:
        """
        Perform similarity search on the graphics/design document database

        Args:
            query: Search query
            k: Number of results to return
            match_threshold: Minimum similarity threshold (0.0 to 1.0)

        Returns:
            List of Document objects with relevant document chunks
        """
        # Generate embedding for query
        query_embedding = self._generate_embedding(query)

        # Call RPC function
        try:
            response = self.supabase.rpc(
                'match_documents',
                {
                    'query_embedding': query_embedding,
                    'match_threshold': match_threshold,
                    'match_count': k
                }
            ).execute()

            # Convert results to Document objects
            documents = []
            for item in response.data:
                # Handle None chunk_text
                chunk_text = item.get('chunk_text') or ''
                doc = Document(
                    page_content=chunk_text,
                    metadata={
                        'id': item.get('id'),
                        'source_type': item.get('source_type'),
                        'source_id': item.get('source_id'),
                        'title': item.get('title', ''),
                        'content_type': item.get('content_type'),
                        'chunk_index': item.get('chunk_index'),
                        'page_number': item.get('page_number'),
                        'word_count': item.get('word_count'),
                        'metadata': item.get('metadata', {}),
                        'similarity': item.get('similarity')
                    }
                )
                documents.append(doc)

            return documents
        except Exception as e:
            raise Exception(f"Error performing similarity search: {str(e)}")

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 5
    ) -> List[tuple]:
        """
        Perform similarity search and return documents with relevance scores

        Args:
            query: Search query
            k: Number of results to return

        Returns:
            List of tuples (Document, score)
        """
        # Generate embedding for query
        query_embedding = self._generate_embedding(query)

        # Call RPC function
        try:
            response = self.supabase.rpc(
                'match_documents',
                {
                    'query_embedding': query_embedding,
                    'match_threshold': 0.0,  # Get all matches
                    'match_count': k
                }
            ).execute()

            # Convert results to Document objects with scores
            results = []
            for item in response.data:
                # Handle None chunk_text
                chunk_text = item.get('chunk_text') or ''
                doc = Document(
                    page_content=chunk_text,
                    metadata={
                        'id': item.get('id'),
                        'source_type': item.get('source_type'),
                        'source_id': item.get('source_id'),
                        'title': item.get('title', ''),
                        'content_type': item.get('content_type'),
                        'chunk_index': item.get('chunk_index'),
                        'page_number': item.get('page_number'),
                        'word_count': item.get('word_count'),
                        'metadata': item.get('metadata', {})
                    }
                )
                score = item.get('similarity', 0.0)
                results.append((doc, score))

            return results
        except Exception as e:
            raise Exception(f"Error performing similarity search: {str(e)}")

    def get_retriever(self, k: int = 5):
        """
        Get a retriever-like object for LangChain compatibility

        Args:
            k: Number of results to return

        Returns:
            Simple retriever object with a get_relevant_documents method
        """
        class SimpleRetriever:
            def __init__(self, vectorstore, k):
                self.vectorstore = vectorstore
                self.k = k

            def get_relevant_documents(self, query: str) -> List[Document]:
                return self.vectorstore.similarity_search(query, k=self.k)

        return SimpleRetriever(self, k)

    def format_documents_for_context(self, documents: List[Document]) -> str:
        """
        Format retrieved documents for inclusion in LLM context

        Args:
            documents: List of retrieved Document objects

        Returns:
            Formatted string with document information
        """
        formatted_docs = []
        for i, doc in enumerate(documents, 1):
            metadata = doc.metadata
            source_info = f"Source: {metadata.get('source_id', 'Unknown')}"
            if metadata.get('page_number'):
                source_info += f" (Page {metadata.get('page_number')})"

            doc_info = f"""
Document {i}: {source_info}
Type: {metadata.get('source_type', 'N/A')} | Content: {metadata.get('content_type', 'text')}
{doc.page_content}
"""
            formatted_docs.append(doc_info.strip())

        return "\n\n---\n\n".join(formatted_docs)

    def get_source_types(self) -> List[str]:
        """Get list of available source types from database"""
        try:
            response = self.supabase.table('document_embeddings')\
                .select('source_type')\
                .execute()

            # Extract unique source types
            source_types = set()
            for item in response.data:
                if item.get('source_type'):
                    source_types.add(item['source_type'])

            return sorted(source_types)
        except Exception:
            # Return common source types as fallback
            return [
                "pdf",
                "url",
                "image"
            ]


def create_vectorstore() -> GraphicsVectorStore:
    """Factory function to create and return a configured vector store"""
    return GraphicsVectorStore()
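

# Minimal usage sketch (not part of the library surface). It assumes the environment
# variables checked in __init__ (SUPABASE_URL, SUPABASE_KEY, JINA_API_KEY) are set and
# that the `match_documents` RPC exists in the target Supabase project; the query text
# and k values below are arbitrary examples.
if __name__ == "__main__":
    store = create_vectorstore()

    # Plain similarity search returning Document objects
    docs = store.similarity_search("color theory for poster design", k=3)
    print(store.format_documents_for_context(docs))

    # Same search, but with similarity scores attached
    for doc, score in store.similarity_search_with_score("grid systems in layout", k=3):
        print(f"{score:.3f}  {doc.metadata.get('title', '')}")

    # LangChain-style retriever wrapper
    retriever = store.get_retriever(k=3)
    relevant = retriever.get_relevant_documents("typography pairing guidelines")
    print(f"Retrieved {len(relevant)} documents")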