#!/usr/bin/env python3 """ Incremental Document Addition for VedaMD Vector Store ====================================================== This script allows you to add single documents to an existing vector store without rebuilding the entire index. Features: - Process single PDF file - Detect duplicates (hash-based) - Add to existing FAISS index - Update metadata - Incremental upload to HF Hub - No full rebuild required Usage: python scripts/add_document.py \\ --file ./new_guideline.pdf \\ --citation "SLCOG Hypertension Guidelines 2025" \\ --vector-store-dir ./data/vector_store \\ --upload Author: VedaMD Team Date: October 22, 2025 Version: 1.0.0 """ import os import sys import json import hashlib import logging import argparse from pathlib import Path from typing import Dict, Optional, List from datetime import datetime import warnings # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) # Import from build_vector_store try: from build_vector_store import PDFExtractor, MedicalChunker except ImportError: # If running standalone, define minimal versions logger = logging.getLogger(__name__) logger.error("Cannot import from build_vector_store.py. Make sure it's in the same directory.") sys.exit(1) # Embeddings and vector store try: from sentence_transformers import SentenceTransformer import faiss import numpy as np HAS_EMBEDDINGS = True except ImportError: HAS_EMBEDDINGS = False raise ImportError("Required packages not installed. Run: pip install sentence-transformers faiss-cpu numpy") # Hugging Face Hub try: from huggingface_hub import HfApi HAS_HF = True except ImportError: HAS_HF = False warnings.warn("Hugging Face Hub not available. Install with: pip install huggingface-hub") # Setup logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(sys.stdout), logging.FileHandler('add_document.log') ] ) logger = logging.getLogger(__name__) class DocumentAdder: """Add documents incrementally to existing vector store""" def __init__(self, vector_store_dir: str): self.vector_store_dir = Path(vector_store_dir) if not self.vector_store_dir.exists(): raise FileNotFoundError(f"Vector store directory not found: {self.vector_store_dir}") logger.info(f"šŸ“ Vector store directory: {self.vector_store_dir}") # Load existing vector store self.load_vector_store() def load_vector_store(self): """Load existing vector store from disk""" logger.info("šŸ“„ Loading existing vector store...") # Load config config_path = self.vector_store_dir / "config.json" if not config_path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") with open(config_path, 'r') as f: self.config = json.load(f) logger.info(f"āœ… Loaded config: {self.config['embedding_model']}") # Load FAISS index index_path = self.vector_store_dir / "faiss_index.bin" if not index_path.exists(): raise FileNotFoundError(f"FAISS index not found: {index_path}") self.index = faiss.read_index(str(index_path)) logger.info(f"āœ… Loaded FAISS index: {self.index.ntotal} vectors") # Load documents docs_path = self.vector_store_dir / "documents.json" if not docs_path.exists(): raise FileNotFoundError(f"Documents file not found: {docs_path}") with open(docs_path, 'r', encoding='utf-8') as f: self.documents = json.load(f) logger.info(f"āœ… Loaded {len(self.documents)} documents") # Load metadata metadata_path = self.vector_store_dir / "metadata.json" if not metadata_path.exists(): raise FileNotFoundError(f"Metadata file not found: {metadata_path}") with open(metadata_path, 'r', encoding='utf-8') as f: self.metadata = json.load(f) logger.info(f"āœ… Loaded {len(self.metadata)} metadata entries") # Load embedding model logger.info(f"šŸ¤– Loading embedding model: {self.config['embedding_model']}") self.embedding_model = SentenceTransformer(self.config['embedding_model']) self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension() if self.embedding_dim != self.config['embedding_dim']: raise ValueError( f"Embedding dimension mismatch! " f"Expected {self.config['embedding_dim']}, got {self.embedding_dim}" ) logger.info(f"āœ… Embedding model loaded (dim={self.embedding_dim})") # Initialize chunker self.chunker = MedicalChunker( chunk_size=self.config.get('chunk_size', 1000), chunk_overlap=self.config.get('chunk_overlap', 100) ) def check_duplicate(self, file_hash: str, filename: str) -> bool: """Check if document already exists in vector store""" logger.info(f"šŸ” Checking for duplicates...") for meta in self.metadata: if meta.get('file_hash') == file_hash: logger.warning(f"āš ļø Duplicate detected: {meta['source']} (hash: {file_hash[:8]}...)") return True # Also check by filename if meta.get('source') == filename: logger.warning(f"āš ļø File with same name exists: {filename}") # Don't return True here - might be updated version logger.info(f" Continuing anyway (different content)") logger.info(f"āœ… No duplicates found") return False def add_document( self, pdf_path: str, citation: Optional[str] = None, category: Optional[str] = None, skip_duplicates: bool = True ) -> int: """Add a single document to the vector store""" pdf_path = Path(pdf_path) if not pdf_path.exists(): raise FileNotFoundError(f"PDF file not found: {pdf_path}") logger.info(f"\n{'='*60}") logger.info(f"šŸ“„ Adding document: {pdf_path.name}") logger.info(f"{'='*60}") try: # Extract text text, extraction_metadata = PDFExtractor.extract_text(str(pdf_path)) if not text or len(text) < 100: logger.warning(f"āš ļø Extracted text too short ({len(text)} chars), skipping") return 0 # Generate file hash file_hash = hashlib.md5(text.encode()).hexdigest() logger.info(f"šŸ”‘ File hash: {file_hash[:16]}...") # Check for duplicates if skip_duplicates and self.check_duplicate(file_hash, pdf_path.name): logger.warning(f"āš ļø Skipping duplicate document") return 0 # Chunk text chunks = self.chunker.chunk_text(text, pdf_path.name) if not chunks: logger.warning(f"āš ļø No chunks created from {pdf_path.name}") return 0 logger.info(f"šŸ“ Created {len(chunks)} chunks") # Generate embeddings logger.info(f"🧮 Generating embeddings...") chunk_texts = [chunk["content"] for chunk in chunks] chunk_embeddings = self.embedding_model.encode( chunk_texts, show_progress_bar=True, batch_size=32 ) # Add to FAISS index logger.info(f"šŸ“Š Adding to FAISS index...") embeddings_array = np.array(chunk_embeddings).astype('float32') self.index.add(embeddings_array) # Add documents and metadata base_chunk_id = len(self.documents) for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings)): self.documents.append(chunk["content"]) self.metadata.append({ "source": pdf_path.name, "section": chunk["section"], "chunk_id": base_chunk_id + i, "chunk_size": chunk["size"], "file_hash": file_hash, "extraction_method": extraction_metadata["method"], "total_pages": extraction_metadata["pages"], "citation": citation or pdf_path.name, "category": category or "General", "added_at": datetime.now().isoformat(), "added_by": "add_document.py" }) logger.info(f"āœ… Added {len(chunks)} chunks to vector store") logger.info(f"šŸ“Š New total: {self.index.ntotal} vectors") return len(chunks) except Exception as e: logger.error(f"āŒ Error adding document: {e}") raise def save_vector_store(self): """Save updated vector store to disk""" logger.info(f"\n{'='*60}") logger.info(f"šŸ’¾ Saving updated vector store...") logger.info(f"{'='*60}") # Backup existing files first backup_dir = self.vector_store_dir / "backups" / datetime.now().strftime("%Y%m%d_%H%M%S") backup_dir.mkdir(parents=True, exist_ok=True) for filename in ["faiss_index.bin", "documents.json", "metadata.json"]: src = self.vector_store_dir / filename if src.exists(): dst = backup_dir / filename import shutil shutil.copy2(src, dst) logger.info(f"šŸ“¦ Backup created: {backup_dir}") # Save FAISS index index_path = self.vector_store_dir / "faiss_index.bin" faiss.write_index(self.index, str(index_path)) logger.info(f"āœ… Saved FAISS index: {index_path}") # Save documents docs_path = self.vector_store_dir / "documents.json" with open(docs_path, 'w', encoding='utf-8') as f: json.dump(self.documents, f, ensure_ascii=False, indent=2) logger.info(f"āœ… Saved documents: {docs_path}") # Save metadata metadata_path = self.vector_store_dir / "metadata.json" with open(metadata_path, 'w', encoding='utf-8') as f: json.dump(self.metadata, f, ensure_ascii=False, indent=2) logger.info(f"āœ… Saved metadata: {metadata_path}") # Update config self.config["total_documents"] = len(self.documents) self.config["total_chunks"] = len(self.documents) self.config["last_updated"] = datetime.now().isoformat() config_path = self.vector_store_dir / "config.json" with open(config_path, 'w', encoding='utf-8') as f: json.dump(self.config, f, indent=2) logger.info(f"āœ… Updated config: {config_path}") def upload_to_hf(self, repo_id: str, token: Optional[str] = None): """Upload updated vector store to Hugging Face Hub""" if not HAS_HF: logger.warning("āš ļø Hugging Face Hub not available, skipping upload") return logger.info(f"\n{'='*60}") logger.info(f"ā˜ļø Uploading to Hugging Face Hub...") logger.info(f"šŸ“¦ Repository: {repo_id}") logger.info(f"{'='*60}") try: api = HfApi(token=token) # Upload updated files files_to_upload = [ "faiss_index.bin", "documents.json", "metadata.json", "config.json" ] for filename in files_to_upload: file_path = self.vector_store_dir / filename if file_path.exists(): logger.info(f"šŸ“¤ Uploading {filename}...") api.upload_file( path_or_fileobj=str(file_path), path_in_repo=filename, repo_id=repo_id, repo_type="dataset", token=token ) logger.info(f"āœ… Uploaded {filename}") logger.info(f"šŸŽ‰ Upload complete! View at: https://huggingface.co/datasets/{repo_id}") except Exception as e: logger.error(f"āŒ Upload failed: {e}") raise def main(): parser = argparse.ArgumentParser( description="Add a document to existing VedaMD Vector Store", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Add document locally python scripts/add_document.py \\ --file ./guidelines/new_protocol.pdf \\ --citation "SLCOG Hypertension Guidelines 2025" \\ --vector-store-dir ./data/vector_store # Add and upload to HF python scripts/add_document.py \\ --file ./new_guideline.pdf \\ --citation "WHO Clinical Guidelines 2025" \\ --category "Obstetrics" \\ --vector-store-dir ./data/vector_store \\ --upload \\ --repo-id sniro23/VedaMD-Vector-Store """ ) parser.add_argument( "--file", type=str, required=True, help="PDF file to add" ) parser.add_argument( "--citation", type=str, help="Citation for the document" ) parser.add_argument( "--category", type=str, help="Category/specialty (e.g., Obstetrics, Cardiology)" ) parser.add_argument( "--vector-store-dir", type=str, default="./data/vector_store", help="Vector store directory" ) parser.add_argument( "--no-duplicate-check", action="store_true", help="Skip duplicate detection" ) parser.add_argument( "--upload", action="store_true", help="Upload to Hugging Face Hub after adding" ) parser.add_argument( "--repo-id", type=str, help="Hugging Face repository ID" ) parser.add_argument( "--hf-token", type=str, help="Hugging Face API token" ) args = parser.parse_args() # Get HF token hf_token = args.hf_token or os.getenv("HF_TOKEN") # Validate upload arguments if args.upload and not args.repo_id: parser.error("--repo-id is required when --upload is specified") # Add document start_time = datetime.now() adder = DocumentAdder(args.vector_store_dir) chunks_added = adder.add_document( pdf_path=args.file, citation=args.citation, category=args.category, skip_duplicates=not args.no_duplicate_check ) if chunks_added > 0: # Save updated vector store adder.save_vector_store() # Upload if requested if args.upload and args.repo_id: adder.upload_to_hf(args.repo_id, hf_token) # Summary duration = (datetime.now() - start_time).total_seconds() logger.info(f"\n{'='*60}") logger.info(f"āœ… DOCUMENT ADDED SUCCESSFULLY!") logger.info(f"{'='*60}") logger.info(f"šŸ“Š Summary:") logger.info(f" • Chunks added: {chunks_added}") logger.info(f" • Total vectors: {adder.index.ntotal}") logger.info(f" • Time taken: {duration:.2f} seconds") logger.info(f"{'='*60}\n") else: logger.warning(f"\nāš ļø No chunks were added (possibly duplicate or invalid)") if __name__ == "__main__": main()