""" Vector database manager for NBA data using ChromaDB and sentence-transformers. """ import os import pandas as pd import chromadb from chromadb.config import Settings from sentence_transformers import SentenceTransformer from typing import List, Dict, Optional import json class NBAVectorDB: """ Manages vector embeddings and semantic search for NBA data. Uses sentence-transformers for embeddings and ChromaDB for storage. """ def __init__(self, csv_path: str, collection_name: str = "nba_data", persist_directory: str = "./chroma_db"): """ Initialize the vector database. Args: csv_path: Path to the NBA CSV file collection_name: Name of the ChromaDB collection persist_directory: Directory to persist the vector database """ self.csv_path = csv_path self.collection_name = collection_name self.persist_directory = persist_directory # Initialize embedding model (open-source, runs locally) # Using all-MiniLM-L6-v2: fast, good quality, 384 dimensions print("Loading embedding model...") self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2') print("Embedding model loaded!") # Initialize ChromaDB client os.makedirs(persist_directory, exist_ok=True) self.client = chromadb.PersistentClient( path=persist_directory, settings=Settings(anonymized_telemetry=False) ) # Get or create collection self.collection = self.client.get_or_create_collection( name=collection_name, metadata={"description": "NBA 2024-25 season data"} ) # Check if collection is empty and needs indexing if self.collection.count() == 0: print("Vector database is empty. Indexing CSV data...") self._index_csv() else: print(f"Vector database loaded with {self.collection.count()} records") def _create_text_representation(self, row: pd.Series) -> str: """ Convert a DataFrame row to a text representation for embedding. Args: row: A pandas Series representing a row Returns: str: Text representation of the row """ # Create a natural language description of the row parts = [] if 'Player' in row: parts.append(f"Player: {row['Player']}") if 'Tm' in row: parts.append(f"Team: {row['Tm']}") if 'Opp' in row: parts.append(f"Opponent: {row['Opp']}") if 'Res' in row: parts.append(f"Result: {'Win' if row['Res'] == 'W' else 'Loss'}") if 'PTS' in row and pd.notna(row['PTS']): parts.append(f"Points: {row['PTS']}") if 'AST' in row and pd.notna(row['AST']): parts.append(f"Assists: {row['AST']}") if 'TRB' in row and pd.notna(row['TRB']): parts.append(f"Rebounds: {row['TRB']}") if 'FG%' in row and pd.notna(row['FG%']): parts.append(f"Field Goal Percentage: {row['FG%']:.3f}") if '3P%' in row and pd.notna(row['3P%']): parts.append(f"Three Point Percentage: {row['3P%']:.3f}") if 'Data' in row: parts.append(f"Date: {row['Data']}") return ". ".join(parts) def _index_csv(self): """ Read CSV file, create embeddings, and store in ChromaDB. """ print(f"Reading CSV from {self.csv_path}...") df = pd.read_csv(self.csv_path) print(f"Creating embeddings for {len(df)} records...") texts = [] metadatas = [] ids = [] # Process in batches for efficiency batch_size = 100 total_batches = (len(df) + batch_size - 1) // batch_size for batch_idx in range(0, len(df), batch_size): batch_df = df.iloc[batch_idx:batch_idx + batch_size] batch_num = (batch_idx // batch_size) + 1 batch_texts = [] batch_metadatas = [] batch_ids = [] for idx, row in batch_df.iterrows(): # Create text representation text = self._create_text_representation(row) batch_texts.append(text) # Store metadata (original row data as JSON) metadata = { 'row_index': int(idx), 'player': str(row.get('Player', '')), 'team': str(row.get('Tm', '')), 'opponent': str(row.get('Opp', '')), 'result': str(row.get('Res', '')), 'points': float(row.get('PTS', 0)) if pd.notna(row.get('PTS')) else 0.0, 'date': str(row.get('Data', '')), } batch_metadatas.append(metadata) batch_ids.append(f"row_{idx}") # Generate embeddings for this batch print(f"Processing batch {batch_num}/{total_batches} ({len(batch_texts)} records)...") embeddings = self.embedding_model.encode( batch_texts, show_progress_bar=False, convert_to_numpy=True ).tolist() # Add to ChromaDB self.collection.add( embeddings=embeddings, documents=batch_texts, metadatas=batch_metadatas, ids=batch_ids ) texts.extend(batch_texts) metadatas.extend(batch_metadatas) ids.extend(batch_ids) print(f"Successfully indexed {len(df)} records in vector database!") def search(self, query: str, n_results: int = 10) -> List[Dict]: """ Perform semantic search on the NBA data. Args: query: Natural language query n_results: Number of results to return Returns: List of dictionaries containing search results with metadata """ # Generate embedding for the query query_embedding = self.embedding_model.encode( query, convert_to_numpy=True ).tolist() # Search in ChromaDB results = self.collection.query( query_embeddings=[query_embedding], n_results=n_results, include=['documents', 'metadatas', 'distances'] ) # Format results formatted_results = [] if results['ids'] and len(results['ids'][0]) > 0: for i in range(len(results['ids'][0])): formatted_results.append({ 'id': results['ids'][0][i], 'document': results['documents'][0][i], 'metadata': results['metadatas'][0][i], 'distance': results['distances'][0][i], 'similarity': 1 - results['distances'][0][i] # Convert distance to similarity }) return formatted_results def get_original_row(self, row_index: int) -> Optional[pd.Series]: """ Retrieve the original CSV row by index. Args: row_index: Index of the row in the original CSV Returns: pandas Series or None if not found """ try: df = pd.read_csv(self.csv_path) if 0 <= row_index < len(df): return df.iloc[row_index] except Exception as e: print(f"Error retrieving row {row_index}: {e}") return None # Global instance (will be initialized when needed) _vector_db_instance: Optional[NBAVectorDB] = None def get_vector_db(csv_path: str) -> NBAVectorDB: """ Get or create the global vector database instance. Args: csv_path: Path to the CSV file Returns: NBAVectorDB instance """ global _vector_db_instance if _vector_db_instance is None or _vector_db_instance.csv_path != csv_path: _vector_db_instance = NBAVectorDB(csv_path) return _vector_db_instance