Spaces:
Sleeping
Sleeping
Jeff's progress w/o embedding files
Browse files- src/pdf-version/__init__.py +7 -0
- src/pdf-version/data/__init__.py +15 -0
- src/pdf-version/data/loaders.py +49 -0
- src/pdf-version/data/pdf_processing.py +150 -0
- src/pdf-version/demos/__init__.py +9 -0
- src/pdf-version/demos/demo_runner.py +138 -0
- src/pdf-version/generate_embeddings.py +45 -0
- src/pdf-version/indexing/__init__.py +11 -0
- src/pdf-version/indexing/document_indexer.py +100 -0
- src/pdf-version/indexing/embedding_creator.py +108 -0
- src/pdf-version/indexing/storage.py +149 -0
- src/pdf-version/main.py +83 -0
- src/pdf-version/models/__init__.py +5 -0
- src/pdf-version/models/embedding_models.py +56 -0
- src/pdf-version/oncall_ai.py +55 -0
- src/pdf-version/rag/__init__.py +23 -0
- src/pdf-version/rag/medical_rag_pipeline.py +457 -0
- src/pdf-version/retrieval/__init__.py +17 -0
- src/pdf-version/retrieval/chunk_retriever.py +193 -0
- src/pdf-version/retrieval/document_retriever.py +192 -0
- src/pdf-version/utils/__init__.py +5 -0
- src/pdf-version/utils/helpers.py +4 -0
src/pdf-version/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OnCall AI - Medical RAG System
|
| 2 |
+
|
| 3 |
+
A sophisticated two-tier retrieval system for emergency department medical assistance.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
__version__ = "1.0.0"
|
| 7 |
+
__author__ = "OnCall AI Team"
|
src/pdf-version/data/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data loading and PDF processing."""
|
| 2 |
+
|
| 3 |
+
from .loaders import load_annotations, filter_pdf_files
|
| 4 |
+
from .pdf_processing import (
|
| 5 |
+
extract_pdf_text,
|
| 6 |
+
extract_tables_from_pdf,
|
| 7 |
+
extract_images_ocr_from_pdf,
|
| 8 |
+
extract_pdf_content_enhanced
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
__all__ = [
|
| 12 |
+
'load_annotations', 'filter_pdf_files',
|
| 13 |
+
'extract_pdf_text', 'extract_tables_from_pdf',
|
| 14 |
+
'extract_images_ocr_from_pdf', 'extract_pdf_content_enhanced'
|
| 15 |
+
]
|
src/pdf-version/data/loaders.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data loading and annotation handling."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
    """Load medical annotations from a JSON file.

    Args:
        file_path: Path to the annotations JSON file.

    Returns:
        List of annotation dictionaries. An empty list is returned when the
        file is missing, cannot be read, is not valid JSON, or does not hold
        a top-level JSON array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            annotations = json.load(f)
    except FileNotFoundError:
        print(f"failed to find file: {file_path}")
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, so a
        # corrupt file is reported as such instead of as "not found".
        print(f"failed to load file: {file_path} ({exc})")
        return []

    # Callers iterate the result as a list of dicts; anything else is unusable.
    if not isinstance(annotations, list):
        print(f"failed to load file: {file_path} (expected a JSON list)")
        return []

    print(f"Loaded #{len(annotations)} annotated data")
    return annotations
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def filter_pdf_files(annotations: List[Dict], assets_dir: str = "assets") -> List[str]:
    """Filter and validate PDF files from annotations.

    An entry is kept when its 'pdf' value ends with '.pdf' and the file
    exists under ``assets_dir``. Every other entry, including one with no
    'pdf' key, is reported and skipped.

    Args:
        annotations: List of annotation dictionaries.
        assets_dir: Directory containing PDF files.

    Returns:
        List of valid PDF filenames, in annotation order.
    """
    pdf_files = []

    for item in annotations:
        # .get() so an entry without a 'pdf' key is skipped instead of
        # aborting the whole scan with a KeyError.
        filename = item.get('pdf')
        is_pdf = isinstance(filename, str) and filename.endswith('.pdf')

        if is_pdf and os.path.exists(os.path.join(assets_dir, filename)):
            pdf_files.append(filename)
        else:
            print(f"β οΈ Skipping non-pdf and non-existing files: {filename}")

    return pdf_files
|
src/pdf-version/data/pdf_processing.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF content extraction and processing."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import io
|
| 5 |
+
from typing import List
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
# PDF processing imports
|
| 10 |
+
import pdfplumber
|
| 11 |
+
import fitz # PyMuPDF
|
| 12 |
+
import easyocr
|
| 13 |
+
from PIL import Image
|
| 14 |
+
|
| 15 |
+
# LlamaIndex imports
|
| 16 |
+
from llama_index.core import Document, SimpleDirectoryReader
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def extract_pdf_text(pdf_path: str) -> str:
    """Return the plain text of a PDF, one page after another.

    Each page that yields text contributes that text followed by a newline;
    pages with no text are skipped.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The concatenated page text, or an empty string if extraction fails.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        print(f"β PDF text extraction error {pdf_path}: {e}")
        return ""

    return "".join(f"{text}\n" for text in page_texts if text)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def extract_tables_from_pdf(pdf_path: str) -> Document:
    """Extract every table in a PDF and render each one as markdown.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Document whose text is the markdown tables joined by blank lines,
        each prefixed with its page and table number. The text is empty if
        extraction fails.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            rendered_tables = []
            for page_idx, page in enumerate(pdf.pages):
                for table_idx, rows in enumerate(page.extract_tables()):
                    if not rows:
                        continue
                    # The first row supplies the column names, the rest the data.
                    frame = pd.DataFrame(rows[1:], columns=rows[0])
                    markdown = frame.to_markdown(index=False)
                    rendered_tables.append(
                        f"Page{page_idx+1}Table{table_idx+1}:\n{markdown}"
                    )

            return Document(text="\n\n".join(rendered_tables))
    except Exception as e:
        print(f"β οΈ pdfplumber table extraction failed: {e}")
        return Document(text="")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def extract_images_ocr_from_pdf(pdf_path: str) -> Document:
    """Extract text from the images embedded in a PDF using OCR.

    A new English, CPU-only EasyOCR reader is created on every call. Each
    embedded image is decoded, converted to RGB and passed to the reader;
    images that fail are reported and skipped so that one bad image does not
    discard the rest.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Document whose text is the OCR output of all images that produced
        text, joined by blank lines. The text is empty if the PDF cannot be
        processed at all.
    """
    try:
        ocr_reader = easyocr.Reader(['en'], gpu=False)
        doc = fitz.open(pdf_path)

        image_texts = []
        total_images = 0

        # try/finally so the document is closed even if a page-level call raises.
        try:
            for page_num, page in enumerate(doc):
                images = page.get_images(full=True)
                total_images += len(images)

                for img_index, img in enumerate(images):
                    try:
                        xref = img[0]
                        image_bytes = doc.extract_image(xref)["image"]

                        # Convert to a PIL image and run OCR on its pixel array.
                        image_pil = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                        ocr_result = ocr_reader.readtext(np.array(image_pil), detail=0)
                        ocr_text = "\n".join(ocr_result).strip()
                    except Exception as e:
                        # Best effort: report the failing image and move on.
                        print(f"OCR failed for page {page_num+1} image {img_index+1}: {e}")
                        continue

                    if ocr_text:
                        image_texts.append(f"Page {page_num+1} image {img_index+1}:\n{ocr_text}")
        finally:
            doc.close()

        if image_texts:
            print(f"✅ Extracted text from {len(image_texts)}/{total_images} images")

        return Document(text="\n\n".join(image_texts))

    except Exception as e:
        print(f"β οΈ Image OCR extraction failed {pdf_path}: {e}")
        return Document(text="")
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def extract_pdf_content_enhanced(pdf_path: str) -> List[Document]:
    """Extract a PDF's content by combining plain text, tables and image OCR.

    The plain-text documents come first, followed by the table document and
    then the OCR document. The table and OCR documents are included only
    when their text is non-blank. A failure of the plain-text step is
    reported and does not stop the other two steps.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects containing the extracted content.
    """
    documents = []

    print(f"π Processing PDF: {os.path.basename(pdf_path)}")

    # 1. Basic text extraction
    try:
        text_docs = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
        documents.extend(text_docs)
        print("✅ Extracted basic text content")
    except Exception as e:
        print(f"β Basic text extraction failed: {e}")

    # 2. Table extraction, then 3. image OCR extraction (order is preserved).
    for extractor in (extract_tables_from_pdf, extract_images_ocr_from_pdf):
        extra_doc = extractor(pdf_path)
        if extra_doc.text.strip():
            documents.append(extra_doc)

    print(f"✅ Created {len(documents)} document objects in total")
    return documents
|
src/pdf-version/demos/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Demo and testing functions."""
|
| 2 |
+
|
| 3 |
+
from .demo_runner import (
|
| 4 |
+
build_medical_rag_system,
|
| 5 |
+
demo_rag_query,
|
| 6 |
+
demo_all_strategies
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
__all__ = ['build_medical_rag_system', 'demo_rag_query', 'demo_all_strategies']
|
src/pdf-version/demos/demo_runner.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Demo and testing functionality."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from src.models.embedding_models import load_biomedbert_model
|
| 6 |
+
from src.data.loaders import load_annotations
|
| 7 |
+
from src.indexing.document_indexer import build_document_index
|
| 8 |
+
from src.indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
|
| 9 |
+
from src.indexing.storage import save_document_system, load_document_system
|
| 10 |
+
from src.retrieval.document_retriever import create_document_tag_mapping, find_relevant_documents
|
| 11 |
+
from src.retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def build_medical_rag_system(enable_chunk_embeddings: bool = True):
    """Build the complete medical RAG system with document-tag indexing.

    Loads the embedding model and the annotations, builds the document index
    (chunk size 256, overlap 25), creates tag embeddings and the document-tag
    mapping, optionally creates chunk embeddings, and saves everything via
    ``save_document_system``.

    Args:
        enable_chunk_embeddings: When True, chunk embeddings are also created.

    Returns:
        Tuple ``(embedding_model, document_index, tag_embeddings,
        doc_tag_mapping, chunk_embeddings)``. ``chunk_embeddings`` is None
        when chunk embeddings are disabled. All five items are None when no
        annotations could be loaded.
    """
    print("π₯ OnCall AI - Medical RAG System Starting")
    print("=" * 60)

    # Load model and data
    embedding_model = load_biomedbert_model()
    annotations = load_annotations()

    if not annotations:
        print("β Unable to load annotation data, exiting")
        return None, None, None, None, None

    # Build document index with sentence-based chunking
    print("\nπ Building document index with sentence-based chunking...")
    document_index = build_document_index(annotations, chunk_size=256, chunk_overlap=25)

    # Create tag embeddings
    print("\nπ Creating tag embeddings...")
    tag_embeddings = create_tag_embeddings(embedding_model, document_index)

    # Create document-tag mapping
    print("\nπ Creating document-tag mapping...")
    doc_tag_mapping = create_document_tag_mapping(document_index, tag_embeddings)

    # Create chunk embeddings if enabled
    chunk_embeddings = None
    if enable_chunk_embeddings:
        print("\nπ Creating chunk embeddings...")
        chunk_embeddings = create_chunk_embeddings(embedding_model, document_index)

    # Save the system
    print("\nπΎ Saving document system...")
    save_document_system(document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings)

    print("\n✅ Medical RAG system built successfully!")
    return embedding_model, document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def demo_rag_query(query: str = "chest pain and shortness of breath",
                   strategy: str = "top_p", use_chunks: bool = True, **kwargs):
    """Run one demo retrieval and return the content selected for RAG.

    A saved system is loaded when available; otherwise a new one is built.
    Documents are selected with the given strategy. When chunk retrieval is
    requested and chunk embeddings exist, the best chunks inside those
    documents are returned; otherwise the full documents are returned.

    Args:
        query: Free-text query.
        strategy: Document selection strategy passed to
            ``find_relevant_documents``.
        use_chunks: Whether to retrieve chunks instead of full documents.
        **kwargs: Extra strategy parameters forwarded to
            ``find_relevant_documents``.

    Returns:
        The RAG content (chunks or full documents), or None when the system
        could not be built.
    """
    print(f"\nπ Demo Query: '{query}' (Strategy: {strategy}, Use chunks: {use_chunks})")
    print("=" * 60)

    # Prefer a previously saved system; the loader's result is only treated
    # as carrying chunk embeddings when it has exactly four items.
    saved = load_document_system()
    document_index, tag_embeddings, doc_tag_mapping = saved[:3]
    chunk_embeddings = saved[3] if len(saved) == 4 else None

    if document_index is not None:
        embedding_model = load_biomedbert_model()
    else:
        print("π¦ No saved system found, building new one...")
        built = build_medical_rag_system(enable_chunk_embeddings=use_chunks)
        if built[0] is None:
            return None
        embedding_model, document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings = built

    # Document-level selection with the requested strategy.
    relevant_docs = find_relevant_documents(
        query, embedding_model, tag_embeddings, doc_tag_mapping,
        strategy=strategy, **kwargs
    )

    if not (use_chunks and chunk_embeddings):
        # Fall back to whole documents.
        rag_content = get_documents_for_rag(relevant_docs, document_index)
        print(f"\nπ Ready for RAG with {len(rag_content)} full documents")
    else:
        # Narrow the selected documents down to their most relevant chunks.
        print("\nπ Finding relevant chunks within selected documents...")
        relevant_chunks = find_relevant_chunks(
            query, embedding_model, relevant_docs, chunk_embeddings, top_chunks_per_doc=3
        )
        rag_content = get_chunks_for_rag(relevant_chunks, max_chunks=10)
        print(f"\nπ Ready for RAG with {len(rag_content)} chunks")

    print("Next step: Feed this content to your LLM for answer generation")
    return rag_content
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def demo_all_strategies(query: str = "chest pain and shortness of breath"):
    """Run the same query through every selection strategy for comparison.

    The strategies tried are ``top_k`` (top_k=3), ``top_p`` (top_p=0.8,
    min_similarity=0.3) and ``threshold`` (similarity_threshold=0.5).

    Args:
        query: Free-text query.

    Returns:
        Dictionary mapping each strategy name to the documents it selected,
        or None when the system could not be built.
    """
    print(f"\n㪠Comparing All Selection Strategies")
    print("=" * 80)

    # Load system. Only the first three items are needed here; slicing keeps
    # this consistent with demo_rag_query, which tolerates a loader result
    # that does not have exactly four items.
    load_result = load_document_system()
    document_index, tag_embeddings, doc_tag_mapping = load_result[:3]

    if document_index is None:
        print("π¦ Building system first...")
        embedding_model, document_index, tag_embeddings, doc_tag_mapping, _ = build_medical_rag_system()
        if document_index is None:
            return None
    else:
        embedding_model = load_biomedbert_model()

    strategies = [
        ("top_k", {"top_k": 3}),
        ("top_p", {"top_p": 0.8, "min_similarity": 0.3}),
        ("threshold", {"similarity_threshold": 0.5}),
    ]

    results = {}
    for strategy, params in strategies:
        print(f"\n{'='*20} {strategy.upper()} Strategy {'='*20}")
        results[strategy] = find_relevant_documents(
            query, embedding_model, tag_embeddings, doc_tag_mapping,
            strategy=strategy, **params
        )

    # Summary comparison
    print(f"\nπ Strategy Comparison Summary:")
    print("-" * 50)
    for strategy, docs in results.items():
        print(f"{strategy:>10}: {len(docs)} documents selected")

    return results
|
src/pdf-version/generate_embeddings.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Quick script to generate new embeddings with sentence-based chunking
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Add src directory to Python path
|
| 10 |
+
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
| 11 |
+
|
| 12 |
+
from src.demos.demo_runner import build_medical_rag_system
|
| 13 |
+
|
| 14 |
+
def main():
    """Build the medical RAG system with chunk embeddings and report the outcome.

    Prints the chunking configuration, runs ``build_medical_rag_system`` and
    lists the generated files on success. A keyboard interrupt or any other
    exception is reported instead of propagating; other exceptions also get a
    printed traceback.
    """
    print("π Starting to build medical RAG system with new sentence-based chunking...")
    print("π Configuration:")
    print(" - Chunk size: 256 tokens")
    print(" - Chunk overlap: 25 tokens (10%)")
    print(" - Method: SentenceSplitter")
    print(" - Enhanced tag embeddings: ✅")
    print(" - Chunk embeddings: ✅")
    print("")

    try:
        result = build_medical_rag_system(enable_chunk_embeddings=True)

        # The first item of the result is None when the build did not succeed.
        if result[0] is None:
            print("β Failed to build system")
        else:
            print("✅ Successfully built medical RAG system!")
            print("π Generated files:")
            for generated in (
                "document_index.json",
                "tag_embeddings.json",
                "document_tag_mapping.json",
                "chunk_embeddings.json",
            ):
                print(f" - {generated}")

    except KeyboardInterrupt:
        print("\nβ οΈ Process interrupted by user")
    except Exception as e:
        print(f"β Error occurred: {e}")
        import traceback
        traceback.print_exc()
|
| 43 |
+
|
| 44 |
+
if __name__ == "__main__":
|
| 45 |
+
main()
|
src/pdf-version/indexing/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Document indexing and embedding generation."""
|
| 2 |
+
|
| 3 |
+
from .document_indexer import build_document_index, split_text_into_chunks
|
| 4 |
+
from .embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
|
| 5 |
+
from .storage import save_document_system, load_document_system
|
| 6 |
+
|
| 7 |
+
__all__ = [
|
| 8 |
+
'build_document_index', 'split_text_into_chunks',
|
| 9 |
+
'create_text_embedding', 'create_tag_embeddings', 'create_chunk_embeddings',
|
| 10 |
+
'save_document_system', 'load_document_system'
|
| 11 |
+
]
|
src/pdf-version/indexing/document_indexer.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Document indexing and chunking functionality."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
from llama_index.core import Document
|
| 6 |
+
from llama_index.core.node_parser import SentenceSplitter
|
| 7 |
+
from src.data.pdf_processing import extract_pdf_content_enhanced
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def split_text_into_chunks(text: str, chunk_size: int = 256, chunk_overlap: int = 25) -> List[Dict]:
    """Split text into sentence-aware chunks using LlamaIndex's SentenceSplitter.

    Args:
        text: Input text to split.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Chunk overlap passed to the splitter.

    Returns:
        One dictionary per non-blank chunk with the keys 'text', 'chunk_id',
        'token_count', 'node_id', 'start_char' and 'end_char'. Blank input
        gives an empty list.
    """
    if not text.strip():
        return []

    # Sentence-aware splitter; the regex breaks on ., ! and ? boundaries.
    sentence_splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        paragraph_separator="\n\n",
        secondary_chunking_regex="[^.!?]+[.!?]",
    )

    # The splitter works on Document objects, so wrap the raw text first.
    nodes = sentence_splitter.get_nodes_from_documents([Document(text=text)])

    chunks = []
    for position, node in enumerate(nodes):
        content = node.get_content()
        if not content.strip():
            continue
        chunks.append({
            'text': content,
            # chunk_id is the node's position, so ids of skipped blank nodes
            # are not reused.
            'chunk_id': position,
            # Whitespace word count, used as an approximate token count.
            'token_count': len(content.split()),
            'node_id': node.node_id,
            'start_char': getattr(node, 'start_char_idx', 0),
            'end_char': getattr(node, 'end_char_idx', len(content)),
        })

    return chunks
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def build_document_index(annotations: List[Dict], assets_dir: str = "assets",
                         chunk_size: int = 256, chunk_overlap: int = 25) -> Dict:
    """Build a document index with full content, sentence-based chunks and tags.

    Annotations whose PDF does not exist under ``assets_dir`` are reported
    and skipped.

    Args:
        annotations: List of annotation dictionaries; each must have a 'pdf'
            key and may have 'symptoms', 'diagnoses' and 'treatments' lists.
        assets_dir: Directory containing PDF files.
        chunk_size: Chunk size passed to ``split_text_into_chunks``.
        chunk_overlap: Chunk overlap passed to ``split_text_into_chunks``.

    Returns:
        Dictionary keyed by PDF name. Each value has the keys 'full_content',
        'chunks', 'symptoms', 'diagnoses', 'treatments' and 'all_tags'.
    """
    document_index = {}

    for item in annotations:
        pdf_name = item['pdf']
        pdf_path = os.path.join(assets_dir, pdf_name)

        if not os.path.exists(pdf_path):
            print(f"β οΈ Skipping missing file: {pdf_name}")
            continue

        print(f"π Indexing document: {pdf_name}")

        # Extract full document content
        documents = extract_pdf_content_enhanced(pdf_path)
        full_text = "\n\n".join(doc.text for doc in documents)

        # Split into chunks
        chunks = split_text_into_chunks(full_text, chunk_size, chunk_overlap)

        # Read each tag group once; all_tags is their concatenation in the
        # order symptoms, diagnoses, treatments.
        symptoms = item.get('symptoms', [])
        diagnoses = item.get('diagnoses', [])
        treatments = item.get('treatments', [])

        document_index[pdf_name] = {
            'full_content': full_text,
            'chunks': chunks,
            'symptoms': symptoms,
            'diagnoses': diagnoses,
            'treatments': treatments,
            'all_tags': symptoms + diagnoses + treatments,
        }

        print(f" π Split into {len(chunks)} chunks")

    print(f"✅ Built index for {len(document_index)} documents")
    return document_index
|
src/pdf-version/indexing/embedding_creator.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Embedding generation for tags and document chunks."""
|
| 2 |
+
|
| 3 |
+
from typing import Dict
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def create_text_embedding(model: SentenceTransformer, text: str) -> np.ndarray:
    """Embed a single piece of text.

    Args:
        model: SentenceTransformer model.
        text: Input text.

    Returns:
        The model's embedding for ``text``. Blank text is not sent to the
        model; it gets a zero vector of the model's embedding dimension.
    """
    if text.strip():
        # encode() takes a batch, so wrap the text and take the only row.
        return model.encode([text])[0]
    return np.zeros(model.get_sentence_embedding_dimension())
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def create_tag_embeddings(model: SentenceTransformer, document_index: Dict) -> Dict:
    """Create context-enhanced embeddings for all unique tags.

    Each non-blank tag is embedded on its own and inside four medical
    context phrases. The stored embedding is the unweighted mean of those
    five vectors.

    Args:
        model: SentenceTransformer model.
        document_index: Document index dictionary; each entry's 'all_tags'
            list is read.

    Returns:
        Dictionary mapping tags to their embeddings.
    """
    all_tags = set()

    # Collect all unique tags
    for doc_info in document_index.values():
        all_tags.update(doc_info['all_tags'])

    print(f"π Creating enhanced embeddings for {len(all_tags)} unique tags")

    tag_embeddings = {}
    for tag in all_tags:
        if not tag.strip():
            continue

        # The bare tag comes first, followed by its medical context variations.
        texts = [
            tag,
            f"patient presents with {tag}",
            f"clinical manifestation of {tag}",
            f"emergency department patient has {tag}",
            f"medical condition: {tag}",
        ]
        embeddings = [create_text_embedding(model, text) for text in texts]

        # Plain average: the tag and each context phrase count equally.
        tag_embeddings[tag] = np.mean(embeddings, axis=0)

    print(f"✅ Created {len(tag_embeddings)} enhanced tag embeddings with medical context")
    return tag_embeddings
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def create_chunk_embeddings(model: SentenceTransformer, document_index: Dict) -> Dict:
    """Create embeddings for all document chunks.

    Args:
        model: SentenceTransformer model.
        document_index: Document index dictionary; each entry's 'chunks'
            list is read.

    Returns:
        Dictionary mapping each document name to a list of records, one per
        non-blank chunk, with the keys 'chunk_id', 'text', 'start_char',
        'end_char', 'token_count' and 'embedding'.
    """
    chunk_embeddings = {}
    total_chunks = 0

    print("π Creating chunk embeddings...")

    for pdf_name, doc_info in document_index.items():
        doc_chunk_embeddings = []

        for chunk in doc_info['chunks']:
            chunk_text = chunk['text']
            if not chunk_text.strip():
                continue

            doc_chunk_embeddings.append({
                'chunk_id': chunk['chunk_id'],
                'text': chunk_text,
                # Fall back to values derived from the text when the chunk
                # has no stored offsets or token count.
                'start_char': chunk.get('start_char', 0),
                'end_char': chunk.get('end_char', len(chunk_text)),
                'token_count': chunk.get('token_count', len(chunk_text.split())),
                'embedding': create_text_embedding(model, chunk_text),
            })

        chunk_embeddings[pdf_name] = doc_chunk_embeddings
        total_chunks += len(doc_chunk_embeddings)
        print(f" π {pdf_name}: {len(doc_chunk_embeddings)} chunks")

    print(f"✅ Created embeddings for {total_chunks} chunks across all documents")
    return chunk_embeddings
|
src/pdf-version/indexing/storage.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data persistence for document system."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from typing import Dict, Optional, Tuple
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def save_document_system(document_index: Dict, tag_embeddings: Dict,
                         doc_tag_mapping: Dict, chunk_embeddings: Optional[Dict] = None,
                         output_dir: str = "."):
    """Save the complete document indexing system as UTF-8 JSON files.

    Writes ``document_index.json``, ``tag_embeddings.json`` and
    ``document_tag_mapping.json`` into ``output_dir``. ``chunk_embeddings.json``
    is written only when ``chunk_embeddings`` is non-empty.

    All payloads are built before the first file is opened, so a malformed
    entry raises before anything is written to disk.

    Args:
        document_index: Document index dictionary.
        tag_embeddings: Tag embeddings dictionary.
        doc_tag_mapping: Document-tag mapping dictionary.
        chunk_embeddings: Chunk embeddings dictionary (optional).
        output_dir: Output directory for saved files; created if missing.

    Raises:
        KeyError: If an entry lacks one of the required keys
            (e.g. ``symptoms``, ``diagnoses``, ``all_tags``, ``embedding``).
        OSError: If the directory or a file cannot be created/written.
    """
    def _to_list(embedding):
        # Embeddings normally expose ``tolist()`` (e.g. NumPy arrays); plain
        # sequences are accepted too so already-serialised data can be re-saved.
        return embedding.tolist() if hasattr(embedding, 'tolist') else list(embedding)

    def _dump(filename, payload):
        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    # Document index (content + metadata + chunks)
    doc_index_serializable = {
        doc_name: {
            'full_content': doc_info.get('full_content', doc_info.get('content', '')),
            'chunks': doc_info.get('chunks', []),
            'symptoms': doc_info['symptoms'],
            'diagnoses': doc_info['diagnoses'],
            'treatments': doc_info.get('treatments', []),
            'all_tags': doc_info['all_tags'],
        }
        for doc_name, doc_info in document_index.items()
    }

    # Tag embeddings
    tag_embeddings_serializable = {
        tag: _to_list(embedding) for tag, embedding in tag_embeddings.items()
    }

    # Document-tag mapping
    doc_tag_serializable = {
        doc_name: {
            'tags': doc_info['tags'],
            'symptoms': doc_info['symptoms'],
            'diagnoses': doc_info['diagnoses'],
            'treatments': doc_info['treatments'],
            'tag_embeddings': {
                tag: _to_list(embedding)
                for tag, embedding in doc_info['tag_embeddings'].items()
            },
        }
        for doc_name, doc_info in doc_tag_mapping.items()
    }

    # Chunk embeddings (only when provided)
    chunk_embeddings_serializable = None
    if chunk_embeddings:
        chunk_embeddings_serializable = {
            doc_name: [
                {
                    'chunk_id': chunk['chunk_id'],
                    'text': chunk['text'],
                    'start_char': chunk.get('start_char', 0),
                    'end_char': chunk.get('end_char', len(chunk['text'])),
                    'token_count': chunk.get('token_count', len(chunk['text'].split())),
                    'embedding': _to_list(chunk['embedding']),
                }
                for chunk in chunks
            ]
            for doc_name, chunks in chunk_embeddings.items()
        }

    # An empty ``output_dir`` means "current directory"; nothing to create then.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    _dump('document_index.json', doc_index_serializable)
    _dump('tag_embeddings.json', tag_embeddings_serializable)
    _dump('document_tag_mapping.json', doc_tag_serializable)
    if chunk_embeddings_serializable is not None:
        _dump('chunk_embeddings.json', chunk_embeddings_serializable)

    print("✅ Document system saved to files")
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def load_document_system(input_dir: str = ".") -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict]]:
    """Load the complete document indexing system from JSON files.

    Reads ``document_index.json``, ``tag_embeddings.json`` and
    ``document_tag_mapping.json`` from ``input_dir``; ``chunk_embeddings.json``
    is read only if it exists. Stored embedding lists are converted back to
    NumPy arrays.

    Args:
        input_dir: Input directory containing saved files.

    Returns:
        Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings).
        ``chunk_embeddings`` is None when no chunk file is present.
        Returns (None, None, None, None) if loading fails.
    """
    def _read_json(filename):
        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as f:
            return json.load(f)

    try:
        # Document index is used as stored.
        document_index = _read_json('document_index.json')

        # Tag embeddings: list -> ndarray
        tag_embeddings = {
            tag: np.array(embedding)
            for tag, embedding in _read_json('tag_embeddings.json').items()
        }

        # Document-tag mapping, with per-document tag embeddings as ndarrays
        doc_tag_mapping = {
            doc_name: {
                'tags': doc_info['tags'],
                'symptoms': doc_info['symptoms'],
                'diagnoses': doc_info['diagnoses'],
                'treatments': doc_info['treatments'],
                'tag_embeddings': {
                    tag: np.array(embedding)
                    for tag, embedding in doc_info['tag_embeddings'].items()
                },
            }
            for doc_name, doc_info in _read_json('document_tag_mapping.json').items()
        }

        # Chunk embeddings are optional.
        chunk_embeddings = None
        if os.path.exists(os.path.join(input_dir, 'chunk_embeddings.json')):
            chunk_embeddings = {
                doc_name: [
                    {
                        'chunk_id': chunk['chunk_id'],
                        'text': chunk['text'],
                        'start_char': chunk.get('start_char', 0),
                        'end_char': chunk.get('end_char', len(chunk['text'])),
                        'token_count': chunk.get('token_count', len(chunk['text'].split())),
                        # Backward compatibility for old format
                        'start_word': chunk.get('start_word', 0),
                        'end_word': chunk.get('end_word', len(chunk['text'].split())),
                        'embedding': np.array(chunk['embedding']),
                    }
                    for chunk in chunks
                ]
                for doc_name, chunks in _read_json('chunk_embeddings.json').items()
            }
            print("✅ Chunk embeddings loaded")

        print("✅ Document system loaded successfully")
        return document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings

    except Exception as e:
        # Deliberately broad: the contract is "report and return Nones" for any
        # missing, unreadable or malformed file.
        print(f"❌ Failed to load document system: {e}")
        return None, None, None, None
|
src/pdf-version/main.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""OnCall AI - Medical RAG System
|
| 3 |
+
|
| 4 |
+
Main entry point for the medical RAG system.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Add src directory to Python path
|
| 11 |
+
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
| 12 |
+
|
| 13 |
+
from src.demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def main():
    """Main program entry point.

    Builds the RAG system with chunk embeddings enabled, then runs a single
    chunk-based demo query using the ``top_p`` strategy. A keyboard interrupt
    ends the run with a short message; any other exception is reported with
    its traceback instead of propagating.
    """
    try:
        # Build the system with chunk embeddings
        build_medical_rag_system(enable_chunk_embeddings=True)

        # Demo chunk-based retrieval
        banner = "=" * 80
        print("\n" + banner)
        print("🧩 CHUNK-BASED RETRIEVAL DEMO")
        print(banner)
        demo_rag_query("chest pain and shortness of breath",
                       strategy="top_p", use_chunks=True, top_p=0.8)

    except KeyboardInterrupt:
        print("\n\n👋 User interrupted, program exiting")
    except Exception as e:
        print(f"\n❌ Program execution error: {e}")
        import traceback
        traceback.print_exc()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def interactive_demo():
    """Interactive demo mode.

    Repeatedly shows a numbered menu and dispatches on the user's choice:
    (re)build the system, run a query with a fixed or user-chosen strategy,
    compare all strategies, or exit. Empty queries are ignored. The loop ends
    on option 6 or when standard input is closed.
    """
    def _ask(prompt):
        return input(prompt).strip()

    print("🏥 OnCall AI - Interactive Demo Mode")
    print("=" * 50)

    try:
        while True:
            print("\nOptions:")
            print("1. Build/rebuild system")
            print("2. Query with TOP-P strategy")
            print("3. Query with TOP-K strategy")
            print("4. Compare all strategies")
            print("5. Custom query")
            print("6. Exit")

            choice = _ask("\nSelect option (1-6): ")

            if choice == "1":
                build_medical_rag_system(enable_chunk_embeddings=True)
            elif choice == "2":
                query = _ask("Enter your query: ")
                if query:
                    demo_rag_query(query, strategy="top_p", use_chunks=True)
            elif choice == "3":
                query = _ask("Enter your query: ")
                if query:
                    demo_rag_query(query, strategy="top_k", use_chunks=True, top_k=3)
            elif choice == "4":
                query = _ask("Enter your query: ")
                if query:
                    demo_all_strategies(query)
            elif choice == "5":
                query = _ask("Enter your query: ")
                strategy = _ask("Enter strategy (top_k/top_p/threshold): ")
                if query and strategy:
                    demo_rag_query(query, strategy=strategy, use_chunks=True)
            elif choice == "6":
                print("👋 Goodbye!")
                break
            else:
                print("❌ Invalid option. Please select 1-6.")
    except EOFError:
        # stdin was closed (piped input exhausted, Ctrl-D): leave the menu
        # cleanly instead of crashing with a traceback.
        print("\n👋 Goodbye!")
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
if __name__ == "__main__":
    # Passing `--interactive` as the first CLI argument selects the
    # menu-driven demo; anything else runs the one-shot demo in main().
    wants_interactive = sys.argv[1:2] == ["--interactive"]
    entry_point = interactive_demo if wants_interactive else main
    entry_point()
|
src/pdf-version/models/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model loading and management."""
|
| 2 |
+
|
| 3 |
+
from .embedding_models import load_biomedbert_model, load_meditron_model
|
| 4 |
+
|
| 5 |
+
__all__ = ['load_biomedbert_model', 'load_meditron_model']
|
src/pdf-version/models/embedding_models.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Embedding model loading and management."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import torch
|
| 5 |
+
from sentence_transformers import SentenceTransformer, models
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def load_biomedbert_model(device: Optional[str] = None) -> SentenceTransformer:
    """Load BGE Large Medical model optimized for medical domain embeddings.

    The model is first loaded directly through ``SentenceTransformer``. If
    that fails for any reason, it is rebuilt manually from a ``Transformer``
    module plus a ``Pooling`` module; errors from this fallback propagate.

    Args:
        device: Device to use ('cuda', 'mps', 'cpu'). Auto-detects if None,
            preferring CUDA, then Apple Silicon MPS, then CPU.

    Returns:
        Loaded SentenceTransformer model, moved to ``device``.
    """
    model_name = 'ls-da3m0ns/bge_large_medical'

    if device is None:
        # ``torch.backends.mps`` is absent on older PyTorch builds; guard the
        # lookup so auto-detection degrades to CPU instead of raising.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            device = "cuda"
        elif mps_backend is not None and mps_backend.is_available():  # Apple Silicon GPU
            device = "mps"
        else:
            device = "cpu"

    print(f"Using device: {device}")

    # Use BGE Large Medical which is optimized for medical domain
    try:
        model = SentenceTransformer(model_name)
        model = model.to(device)
        print("✅ Loaded BGE Large Medical model for medical embeddings")
        return model
    except Exception as e:
        print(f"❌ Failed to load BGE Large Medical: {e}")
        print("🔄 Falling back to manual construction...")

    # Fallback to manual construction if direct loading fails
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    model = model.to(device)
    return model
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def load_meditron_model():
    """Placeholder loader for Meditron-7B; loading is not implemented yet.

    Prints a notice so callers can see the model is unavailable.

    Returns:
        Always None.
    """
    # TODO: Implement Meditron-7B loading, e.g. via transformers'
    # AutoTokenizer / AutoModelForCausalLM with "epfl-llm/meditron-7b".
    notice = "Meditron-7B to be implemented"
    print(notice)
    return None
|
src/pdf-version/oncall_ai.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""OnCall AI - Medical RAG System (Backward Compatibility)
|
| 3 |
+
|
| 4 |
+
This file provides backward compatibility with the original rag.py interface.
|
| 5 |
+
Import everything from the new modular structure.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Add src directory to Python path
|
| 12 |
+
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
| 13 |
+
|
| 14 |
+
# Import all functions for backward compatibility
|
| 15 |
+
from src.models.embedding_models import load_biomedbert_model, load_meditron_model
|
| 16 |
+
from src.data.loaders import load_annotations, filter_pdf_files
|
| 17 |
+
from src.data.pdf_processing import (
|
| 18 |
+
extract_pdf_text, extract_tables_from_pdf,
|
| 19 |
+
extract_images_ocr_from_pdf, extract_pdf_content_enhanced
|
| 20 |
+
)
|
| 21 |
+
from src.indexing.document_indexer import build_document_index, split_text_into_chunks
|
| 22 |
+
from src.indexing.embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
|
| 23 |
+
from src.indexing.storage import save_document_system, load_document_system
|
| 24 |
+
from src.retrieval.document_retriever import (
|
| 25 |
+
find_relevant_documents_top_k, find_relevant_documents_top_p,
|
| 26 |
+
find_relevant_documents_threshold, find_relevant_documents,
|
| 27 |
+
create_document_tag_mapping
|
| 28 |
+
)
|
| 29 |
+
from src.retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
|
| 30 |
+
from src.demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
|
| 31 |
+
|
| 32 |
+
# Main function for backward compatibility
|
| 33 |
+
def main():
    """Main program entry compatible with original rag.py.

    Builds the RAG system with chunk embeddings enabled, then runs a single
    chunk-based demo query using the ``top_p`` strategy. A keyboard interrupt
    ends the run with a short message; any other exception is reported with
    its traceback instead of propagating.
    """
    try:
        # Build the system with chunk embeddings
        build_medical_rag_system(enable_chunk_embeddings=True)

        # Demo chunk-based retrieval
        banner = "=" * 80
        print("\n" + banner)
        print("🧩 CHUNK-BASED RETRIEVAL DEMO")
        print(banner)
        demo_rag_query("chest pain and shortness of breath",
                       strategy="top_p", use_chunks=True, top_p=0.8)

    except KeyboardInterrupt:
        print("\n\n👋 User interrupted, program exiting")
    except Exception as e:
        print(f"\n❌ Program execution error: {e}")
        import traceback
        traceback.print_exc()
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported for its
    # backward-compatible re-exports.
    main()
|
src/pdf-version/rag/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Medical RAG Pipeline module (Functional Programming)."""
|
| 2 |
+
|
| 3 |
+
from .medical_rag_pipeline import (
|
| 4 |
+
generate_with_ollama,
|
| 5 |
+
retrieve_medical_context,
|
| 6 |
+
evaluate_context_quality,
|
| 7 |
+
create_medical_prompt,
|
| 8 |
+
generate_medical_response,
|
| 9 |
+
answer_medical_query,
|
| 10 |
+
load_rag_data,
|
| 11 |
+
quick_medical_query
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
__all__ = [
|
| 15 |
+
'generate_with_ollama',
|
| 16 |
+
'retrieve_medical_context',
|
| 17 |
+
'evaluate_context_quality',
|
| 18 |
+
'create_medical_prompt',
|
| 19 |
+
'generate_medical_response',
|
| 20 |
+
'answer_medical_query',
|
| 21 |
+
'load_rag_data',
|
| 22 |
+
'quick_medical_query'
|
| 23 |
+
]
|
src/pdf-version/rag/medical_rag_pipeline.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Complete Medical RAG Pipeline integrating retrieval system with Meditron-7B (Functional Programming)."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import requests
|
| 5 |
+
import numpy as np
|
| 6 |
+
from typing import Dict, List, Optional, Tuple
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
|
| 9 |
+
# Import existing retrieval components
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 13 |
+
|
| 14 |
+
from retrieval.document_retriever import find_relevant_documents
|
| 15 |
+
from retrieval.chunk_retriever import find_relevant_chunks, get_chunks_for_rag
|
| 16 |
+
from models.embedding_models import load_biomedbert_model
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def generate_with_ollama(prompt: str,
                         model: str = "meditron:7b",
                         base_url: str = "http://localhost:11434",
                         temperature: float = 0.1,
                         max_tokens: int = 300,
                         top_p: float = 0.9,
                         top_k: int = 40,
                         timeout: float = 120) -> Dict:
    """Generate response using Ollama model.

    Sends one non-streaming POST to ``{base_url}/api/generate``.

    Args:
        prompt: Input prompt for the model
        model: Ollama model name
        base_url: Ollama server URL (a trailing slash is tolerated)
        temperature: Sampling temperature
        max_tokens: Maximum tokens to generate (sent as ``num_predict``)
        top_p: Nucleus-sampling option forwarded to the server
        top_k: Top-k sampling option forwarded to the server
        timeout: Request timeout in seconds

    Returns:
        The decoded JSON body on success, otherwise a dictionary with a
        single ``"error"`` key describing the failure. Never raises for
        transport, HTTP-status or body-decoding problems.
    """
    # Avoid "//api/generate" when the caller passes "http://host:port/".
    url = f"{base_url.rstrip('/')}/api/generate"

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
            "top_p": top_p,
            "top_k": top_k,
        },
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        return {"error": f"LLM request failed: {str(e)}"}
    except ValueError as e:
        # Older ``requests`` versions raise a plain ValueError subclass when
        # the body is not valid JSON; keep the "error dict" contract there too.
        return {"error": f"LLM request failed: invalid JSON response ({e})"}
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def retrieve_medical_context(query: str,
                             embedding_model: SentenceTransformer,
                             tag_embeddings: Dict,
                             chunk_embeddings: Dict,
                             doc_tag_mapping: Dict,
                             doc_strategy: str = "top_p",
                             chunk_strategy: str = "top_p",
                             max_chunks: int = 5) -> Dict:
    """
    Retrieve relevant medical context for query using two-stage retrieval.

    Stage 1 selects documents with ``find_relevant_documents``; stage 2
    selects chunks from those documents with ``find_relevant_chunks``
    (dot-product similarity); stage 3 trims the chunks with
    ``get_chunks_for_rag`` and joins them with blank lines.

    Args:
        query: Medical question/query
        embedding_model: BGE Large Medical model
        tag_embeddings: Pre-computed tag embeddings
        chunk_embeddings: Pre-computed chunk embeddings
        doc_tag_mapping: Document to tag mapping
        doc_strategy: Document retrieval strategy
        chunk_strategy: Chunk retrieval strategy
        max_chunks: Maximum chunks to retrieve

    Returns:
        Dictionary with keys ``has_context``, ``relevant_documents``,
        ``relevant_chunks``, ``rag_chunks``, ``context_text`` and
        ``retrieval_metadata``. The metadata always carries the same keys;
        counts and similarities are zero when nothing was retrieved.
    """
    def _no_context(documents):
        # Shared shape for both "nothing found" exits, with the same metadata
        # keys as the success path so consumers never hit a missing key.
        return {
            "has_context": False,
            "relevant_documents": documents,
            "relevant_chunks": [],
            "rag_chunks": [],
            "context_text": "",
            "retrieval_metadata": {
                "total_docs": len(documents),
                "total_chunks": 0,
                "chunks_for_rag": 0,
                "context_length": 0,
                "avg_similarity": 0.0,
                "max_similarity": 0.0,
            },
        }

    print(f"🔍 Retrieving context for: '{query}'")

    # Stage 1: Document-level retrieval
    print("📄 Stage 1: Document retrieval...")
    relevant_docs = find_relevant_documents(
        query, embedding_model, tag_embeddings, doc_tag_mapping,
        strategy=doc_strategy, top_p=0.6, min_similarity=0.5
    )

    if not relevant_docs:
        print("⚠️ No relevant documents found")
        return _no_context([])

    # Stage 2: Chunk-level retrieval
    print("📋 Stage 2: Chunk retrieval...")
    relevant_chunks = find_relevant_chunks(
        query, embedding_model, relevant_docs, chunk_embeddings,
        strategy=chunk_strategy, top_p=0.6, min_similarity=0.3,
        similarity_metric="dot_product"
    )

    if not relevant_chunks:
        print("⚠️ No relevant chunks found")
        return _no_context(relevant_docs)

    # Stage 3: Prepare RAG context
    print("🎯 Stage 3: Preparing RAG context...")
    rag_chunks = get_chunks_for_rag(relevant_chunks, max_chunks)
    context_text = "\n\n".join(rag_chunks)

    # Retrieval statistics are taken over every retrieved chunk, not only
    # the ones kept for the prompt.
    similarities = [chunk['similarity'] for chunk in relevant_chunks]
    avg_similarity = np.mean(similarities)
    max_similarity = max(similarities)

    print(f"✅ Context prepared: {len(rag_chunks)} chunks, avg_sim={avg_similarity:.3f}")

    return {
        "has_context": True,
        "relevant_documents": relevant_docs,
        "relevant_chunks": relevant_chunks,
        "rag_chunks": rag_chunks,
        "context_text": context_text,
        "retrieval_metadata": {
            "total_docs": len(relevant_docs),
            "total_chunks": len(relevant_chunks),
            "chunks_for_rag": len(rag_chunks),
            "context_length": len(context_text),
            "avg_similarity": float(avg_similarity),
            "max_similarity": float(max_similarity),
        },
    }
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def evaluate_context_quality(context_result: Dict, query: str) -> Dict:
    """
    Judge whether the retrieved context is good enough to answer the query.

    Four heuristic checks are applied to the retrieval metadata (best
    similarity, number of chunks kept, context length, mean similarity);
    the context counts as sufficient when at least three of them pass.

    Args:
        context_result: Result from retrieve_medical_context()
        query: Original query (not used by the current heuristics)

    Returns:
        Dictionary with ``is_sufficient``, ``confidence`` (fraction of checks
        passed) and ``reason``; when context exists it also contains
        ``quality_checks`` and ``similarity_stats``.
    """
    # Guard: nothing was retrieved at all.
    if not context_result["has_context"]:
        return {
            "is_sufficient": False,
            "confidence": 0.0,
            "reason": "No relevant medical documents found in database"
        }

    stats = context_result["retrieval_metadata"]
    mean_score = stats["avg_similarity"]
    best_score = stats["max_similarity"]
    kept_chunks = stats["chunks_for_rag"]
    context_chars = stats["context_length"]

    # Heuristic thresholds for each check.
    quality_checks = {
        "high_similarity": best_score >= 0.4,
        "sufficient_chunks": kept_chunks >= 2,
        "sufficient_length": context_chars >= 200,
        "decent_avg_similarity": mean_score >= 0.3
    }

    n_passed = sum(quality_checks.values())
    is_sufficient = n_passed >= 3  # at least 3 of the 4 checks

    # Report the most significant failing check first.
    if is_sufficient:
        reason = "Context appears sufficient for medical response"
    elif not quality_checks["high_similarity"]:
        reason = f"Low similarity to medical documents (max: {best_score:.3f})"
    elif not quality_checks["sufficient_chunks"]:
        reason = f"Insufficient relevant content found ({kept_chunks} chunks)"
    else:
        reason = "Retrieved content may not adequately address the query"

    return {
        "is_sufficient": is_sufficient,
        "confidence": n_passed / len(quality_checks),
        "reason": reason,
        "quality_checks": quality_checks,
        "similarity_stats": {
            "avg_similarity": mean_score,
            "max_similarity": best_score
        }
    }
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def create_medical_prompt(query: str, context: str, context_quality: Dict) -> str:
    """
    Build the LLM prompt: fixed instructions, retrieved context, then the question.

    Args:
        query: User's medical question
        context: Retrieved medical context
        context_quality: Context quality assessment; ``is_sufficient`` picks
            the template and ``reason`` is quoted in the limited-context one

    Returns:
        Formatted prompt for medical LLM
    """
    # Shared instruction header; it ends with a blank line.
    instructions = (
        "You are a medical AI assistant. Your role is to provide accurate medical information based strictly on the provided medical literature context.\n"
        "\n"
        "IMPORTANT GUIDELINES:\n"
        "1. Base your answers ONLY on the provided medical context\n"
        "2. If the context doesn't contain sufficient information to answer the question, clearly state: \"Based on the available medical literature in my database, I cannot provide a complete answer to this question.\"\n"
        "3. Always cite that your response is \"based on the provided medical literature\"\n"
        "4. Do not make assumptions or provide information not present in the context\n"
        "5. For serious medical conditions, always recommend consulting healthcare professionals\n"
        "6. Be precise and use appropriate medical terminology\n"
        "\n"
    )

    # Low-confidence template: states the limitation before the model continues.
    if not context_quality["is_sufficient"]:
        shown_context = context if context else "No directly relevant medical literature found."
        return (
            f"{instructions}\n"
            "\n"
            "LIMITED MEDICAL CONTEXT AVAILABLE:\n"
            f"{shown_context}\n"
            "\n"
            f"QUESTION: {query}\n"
            "\n"
            "MEDICAL RESPONSE: Based on the available medical literature in my database, I have limited information to fully address this question. "
            f"{context_quality['reason']}\n"
            "\n"
            "However, here is what I can provide based on the available context:"
        )

    # High-confidence template: context followed directly by the question.
    return (
        f"{instructions}\n"
        "\n"
        "MEDICAL LITERATURE CONTEXT:\n"
        f"{context}\n"
        "\n"
        f"QUESTION: {query}\n"
        "\n"
        "MEDICAL RESPONSE (based on the provided medical literature):"
    )
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def generate_medical_response(prompt: str, model: str = "meditron:7b") -> Dict:
    """
    Generate medical response using Meditron-7B.

    Calls ``generate_with_ollama`` with a low temperature and a 400-token
    budget, then rejects failed or implausibly short generations.

    Args:
        prompt: Formatted medical prompt
        model: Ollama model name

    Returns:
        LLM response dictionary. ``success`` is True with ``response`` and
        ``generation_metadata`` on success; otherwise ``success`` is False,
        ``response`` holds a user-facing fallback message and ``error``
        describes the failure.
    """
    print("🧠 Generating medical response...")

    # Use low temperature for medical accuracy
    result = generate_with_ollama(
        prompt,
        model=model,
        temperature=0.1,  # Very low for medical precision
        max_tokens=400
    )

    if "error" in result:
        return {
            "success": False,
            "response": "I apologize, but I'm currently unable to process medical queries due to a technical issue. Please consult a healthcare professional for medical advice.",
            "error": result["error"]
        }

    # ``or ""`` also covers a payload whose "response" is present but null,
    # which would otherwise crash on ``.strip()``.
    response_text = (result.get("response") or "").strip()

    # Basic response validation
    if len(response_text) < 20:
        return {
            "success": False,
            "response": "I was unable to generate a meaningful response. Please rephrase your medical question or consult a healthcare professional.",
            "error": "Generated response too short"
        }

    return {
        "success": True,
        "response": response_text,
        "generation_metadata": {
            "model": model,
            "response_length": len(response_text)
        }
    }
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def answer_medical_query(query: str,
                         embedding_model: SentenceTransformer,
                         tag_embeddings: Dict,
                         chunk_embeddings: Dict,
                         doc_tag_mapping: Dict,
                         document_index: Dict,
                         model: str = "meditron:7b",
                         **kwargs) -> Dict:
    """
    Run the complete medical RAG pipeline: retrieve context and generate an answer.

    Args:
        query: Medical question
        embedding_model: Embedding model used to encode the query for retrieval
        tag_embeddings: Pre-computed tag embeddings
        chunk_embeddings: Pre-computed chunk embeddings
        doc_tag_mapping: Document to tag mapping
        document_index: Complete document index. Accepted as part of the
            interface; this function itself does not read it.
        model: Ollama model name
        **kwargs: Extra options, forwarded unchanged to retrieve_medical_context

    Returns:
        Dictionary with "query", "answer", "success", "context_quality",
        "retrieval_metadata" and "sources" ("documents", "chunk_count").
        An "error" entry is added when generation reported one.
    """
    print("\n" + "="*60)
    print(f"🏥 Medical RAG Query: '{query}'")
    print("="*60)

    # Step 1: Retrieve medical context
    context_result = retrieve_medical_context(
        query, embedding_model, tag_embeddings, chunk_embeddings,
        doc_tag_mapping, **kwargs
    )

    # Step 2: Evaluate context quality
    context_quality = evaluate_context_quality(context_result, query)

    print(f"π Context Quality: {context_quality['confidence']:.1%} confidence")
    print(f"π Assessment: {context_quality['reason']}")

    # Step 3: Create medical prompt
    medical_prompt = create_medical_prompt(
        query, context_result["context_text"], context_quality
    )

    # Step 4: Generate medical response
    response_result = generate_medical_response(medical_prompt, model)

    # Step 5: Compile complete result
    complete_result = {
        "query": query,
        "answer": response_result["response"],
        "success": response_result["success"],
        "context_quality": context_quality,
        "retrieval_metadata": context_result["retrieval_metadata"],
        "sources": {
            "documents": context_result["relevant_documents"],
            "chunk_count": len(context_result["rag_chunks"])
        }
    }

    # Add error information if present
    if "error" in response_result:
        complete_result["error"] = response_result["error"]

    # Only announce success when generation actually succeeded; a failed
    # generation still returns a fallback "response" string, so its length
    # must not be reported as a successful answer.
    if response_result["success"]:
        print(f"\n✅ Response generated successfully: {len(response_result['response'])} characters")
    else:
        print(f"\n⚠️ Response generation failed: {response_result.get('error', 'unknown error')}")
    return complete_result
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
def load_rag_data(tag_embeddings_path: str = "tag_embeddings.json",
                  chunk_embeddings_path: str = "chunk_embeddings.json",
                  doc_tag_mapping_path: str = "document_tag_mapping.json",
                  document_index_path: str = "document_index.json") -> Tuple[SentenceTransformer, Dict, Dict, Dict, Dict]:
    """
    Load all RAG data needed for medical question answering.

    Embedding vectors stored as JSON lists are converted to numpy arrays:
    every value of the tag-embedding mapping, and the 'embedding' entry of
    every chunk record.

    Args:
        tag_embeddings_path: Path to tag embeddings
        chunk_embeddings_path: Path to chunk embeddings
        doc_tag_mapping_path: Path to document tag mapping
        document_index_path: Path to document index

    Returns:
        Tuple of (embedding_model, tag_embeddings, chunk_embeddings, doc_tag_mapping, document_index)

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If one of the files is not valid JSON.
    """
    def _read_json(path: str):
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    print("π Loading Medical RAG Data...")

    # Load embedding model
    # NOTE(review): the log message names "BGE Large Medical" while the loader
    # is called load_biomedbert_model — confirm which model is actually loaded.
    print("📦 Loading BGE Large Medical embedding model...")
    embedding_model = load_biomedbert_model()

    # Load embeddings and indices
    print("π Loading embeddings and indices...")

    tag_embeddings = {
        tag: np.array(embedding)
        for tag, embedding in _read_json(tag_embeddings_path).items()
    }

    chunk_embeddings = _read_json(chunk_embeddings_path)
    for chunks in chunk_embeddings.values():
        for chunk in chunks:
            chunk['embedding'] = np.array(chunk['embedding'])

    doc_tag_mapping = _read_json(doc_tag_mapping_path)
    document_index = _read_json(document_index_path)

    print("✅ Medical RAG data loaded successfully!")
    return embedding_model, tag_embeddings, chunk_embeddings, doc_tag_mapping, document_index
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
def quick_medical_query(query: str, max_chunks: int = 3) -> Dict:
    """
    Answer a medical question using the default data files and model.

    Every call goes through load_rag_data() again before answering.

    Args:
        query: Medical question
        max_chunks: Maximum chunks to retrieve

    Returns:
        Medical response dictionary
    """
    # Load everything with the default paths
    rag_resources = load_rag_data()
    encoder, tag_vectors, chunk_vectors, tag_mapping, doc_index = rag_resources

    # Delegate to the full pipeline
    result = answer_medical_query(
        query, encoder, tag_vectors, chunk_vectors,
        tag_mapping, doc_index, max_chunks=max_chunks
    )
    return result
|
src/pdf-version/retrieval/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Retrieval systems for documents and chunks."""
|
| 2 |
+
|
| 3 |
+
from .document_retriever import (
|
| 4 |
+
find_relevant_documents_top_k,
|
| 5 |
+
find_relevant_documents_top_p,
|
| 6 |
+
find_relevant_documents_threshold,
|
| 7 |
+
find_relevant_documents,
|
| 8 |
+
create_document_tag_mapping
|
| 9 |
+
)
|
| 10 |
+
from .chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
'find_relevant_documents_top_k', 'find_relevant_documents_top_p',
|
| 14 |
+
'find_relevant_documents_threshold', 'find_relevant_documents',
|
| 15 |
+
'create_document_tag_mapping', 'find_relevant_chunks',
|
| 16 |
+
'get_documents_for_rag', 'get_chunks_for_rag'
|
| 17 |
+
]
|
src/pdf-version/retrieval/chunk_retriever.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Chunk-level retrieval functionality."""
|
| 2 |
+
|
| 3 |
+
from typing import List, Dict, Callable
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
from src.indexing.embedding_creator import create_text_embedding
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector
        vec2: Second vector

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        as a plain float. If either vector has zero norm the cosine is
        undefined, and 0.0 is returned instead of NaN.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-length vector would make the division yield NaN (plus a
    # RuntimeWarning); NaN scores cannot be ordered reliably when ranking.
    if norm_product == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def dot_product_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the plain dot product of two vectors as a similarity score."""
    # No normalisation is applied. When both inputs are unit-length the dot
    # product equals the cosine similarity, and it is cheaper to compute.
    inner_product = np.dot(vec1, vec2)
    return inner_product
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Registry mapping a similarity-metric name to the function that computes it
SIMILARITY_FUNCTIONS = dict(
    cosine=cosine_similarity,
    dot_product=dot_product_similarity,
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def find_relevant_chunks_top_k(query: str, model: SentenceTransformer,
                               relevant_docs: List[str], chunk_embeddings: Dict,
                               top_chunks_per_doc: int = 3,
                               similarity_metric: str = "cosine") -> List[Dict]:
    """Find most relevant chunks using Top-K strategy (original method).

    Each document contributes its `top_chunks_per_doc` best-scoring chunks;
    the combined list is then sorted by similarity, highest first.

    Args:
        query: Query text to embed and compare against the chunks
        model: Embedding model passed to create_text_embedding
        relevant_docs: Document names to search; names missing from
            `chunk_embeddings` are skipped
        chunk_embeddings: Mapping of document name to a list of chunk records.
            Each record must provide 'chunk_id', 'text' and 'embedding';
            'start_char', 'end_char' and 'token_count' are optional.
        top_chunks_per_doc: Maximum number of chunks kept per document
        similarity_metric: Key into SIMILARITY_FUNCTIONS; an unknown name
            falls back to cosine similarity

    Returns:
        List of chunk dictionaries ('document', 'chunk_id', 'text',
        'start_char', 'end_char', 'token_count', 'similarity').
    """
    query_embedding = create_text_embedding(model, query)

    # The metric does not change between documents, so resolve it once.
    similarity_func = SIMILARITY_FUNCTIONS.get(similarity_metric, cosine_similarity)

    all_relevant_chunks = []

    for doc_name in relevant_docs:
        if doc_name not in chunk_embeddings:
            continue

        # Score every chunk of this document
        chunk_similarities = []
        for chunk_info in chunk_embeddings[doc_name]:
            similarity = similarity_func(query_embedding, chunk_info['embedding'])
            text = chunk_info['text']

            chunk_similarities.append({
                'document': doc_name,
                'chunk_id': chunk_info['chunk_id'],
                'text': text,
                'start_char': chunk_info.get('start_char', 0),
                'end_char': chunk_info.get('end_char', len(text)),
                # Fallback is a whitespace word count, not a model tokenisation
                'token_count': chunk_info.get('token_count', len(text.split())),
                'similarity': similarity
            })

        # Keep only this document's best chunks
        chunk_similarities.sort(key=lambda x: x['similarity'], reverse=True)
        all_relevant_chunks.extend(chunk_similarities[:top_chunks_per_doc])

    # Rank the surviving chunks across all documents
    all_relevant_chunks.sort(key=lambda x: x['similarity'], reverse=True)

    print(f"π Found {len(all_relevant_chunks)} relevant chunks (Top-K)")
    for i, chunk in enumerate(all_relevant_chunks[:5]):  # Show top 5
        print(f" {i+1}. {chunk['document']} (chunk {chunk['chunk_id']}, similarity: {chunk['similarity']:.3f})")
        print(f" Preview: {chunk['text'][:100]}...")

    return all_relevant_chunks
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def find_relevant_chunks_top_p(query: str, model: SentenceTransformer,
                               relevant_docs: List[str], chunk_embeddings: Dict,
                               top_p: float = 0.6, min_similarity: float = 0.3,
                               similarity_metric: str = "cosine") -> List[Dict]:
    """Find most relevant chunks using Top-P strategy for better quality control.

    Chunks from all relevant documents are pooled, filtered by
    `min_similarity`, sorted by similarity (highest first) and then taken in
    order until their share of the total similarity reaches `top_p`.

    Args:
        query: Query text to embed and compare against the chunks
        model: Embedding model passed to create_text_embedding
        relevant_docs: Document names to search; names missing from
            `chunk_embeddings` are skipped
        chunk_embeddings: Mapping of document name to a list of chunk records.
            Each record must provide 'chunk_id', 'text' and 'embedding';
            'start_char', 'end_char' and 'token_count' are optional.
        top_p: Cumulative share of the total similarity at which selection stops
        min_similarity: Chunks scoring below this value are discarded
        similarity_metric: Key into SIMILARITY_FUNCTIONS; an unknown name
            falls back to cosine similarity

    Returns:
        Selected chunk dictionaries, highest similarity first. Empty when no
        chunk passes `min_similarity`.
    """
    query_embedding = create_text_embedding(model, query)

    # The metric does not change between documents, so resolve it once.
    similarity_func = SIMILARITY_FUNCTIONS.get(similarity_metric, cosine_similarity)

    # Collect all chunks from all relevant documents
    all_chunk_similarities = []

    for doc_name in relevant_docs:
        if doc_name not in chunk_embeddings:
            continue

        for chunk_info in chunk_embeddings[doc_name]:
            similarity = similarity_func(query_embedding, chunk_info['embedding'])

            # Only include chunks above minimum similarity threshold
            if similarity >= min_similarity:
                text = chunk_info['text']
                all_chunk_similarities.append({
                    'document': doc_name,
                    'chunk_id': chunk_info['chunk_id'],
                    'text': text,
                    'start_char': chunk_info.get('start_char', 0),
                    'end_char': chunk_info.get('end_char', len(text)),
                    # Fallback is a whitespace word count, not a model tokenisation
                    'token_count': chunk_info.get('token_count', len(text.split())),
                    'similarity': similarity
                })

    if not all_chunk_similarities:
        print(f"⚠️ No chunks found above similarity threshold {min_similarity}")
        return []

    # Sort by similarity
    all_chunk_similarities.sort(key=lambda x: x['similarity'], reverse=True)

    total_score = sum(chunk['similarity'] for chunk in all_chunk_similarities)

    # With min_similarity <= 0 the scores can sum to zero or less, in which
    # case they cannot be normalised into shares. Keep every chunk that
    # passed the threshold rather than dividing by a non-positive total.
    if total_score <= 0:
        print("⚠️ Similarity scores sum to a non-positive value; skipping Top-P cut-off")
        return all_chunk_similarities

    # Apply Top-P selection
    cumulative_prob = 0.0
    selected_chunks = []

    for chunk in all_chunk_similarities:
        cumulative_prob += chunk['similarity'] / total_score
        selected_chunks.append(chunk)

        # Stop when we reach the Top-P threshold
        if cumulative_prob >= top_p:
            break

    print(f"π Found {len(selected_chunks)} relevant chunks (Top-P={top_p})")
    print(f"π Filtered from {len(all_chunk_similarities)} chunks above threshold")
    print(f"π Cumulative probability: {cumulative_prob:.3f}")

    for i, chunk in enumerate(selected_chunks[:5]):  # Show top 5
        print(f" {i+1}. {chunk['document']} (chunk {chunk['chunk_id']}, similarity: {chunk['similarity']:.3f})")
        print(f" Preview: {chunk['text'][:100]}...")

    return selected_chunks
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def find_relevant_chunks(query: str, model: SentenceTransformer,
                         relevant_docs: List[str], chunk_embeddings: Dict,
                         strategy: str = "top_p", **kwargs) -> List[Dict]:
    """Dispatch chunk retrieval to the implementation named by `strategy`.

    Recognised keyword options: "similarity_metric" (both strategies),
    "top_chunks_per_doc" (top_k), "top_p" and "min_similarity" (top_p).
    Options that are not supplied use the defaults shown below.

    Raises:
        ValueError: If `strategy` is neither 'top_k' nor 'top_p'.
    """
    # Reject unsupported strategies up front
    if strategy not in ("top_k", "top_p"):
        raise ValueError(f"Unknown strategy: {strategy}. Use 'top_k' or 'top_p'")

    metric = kwargs.get("similarity_metric", "cosine")

    if strategy == "top_k":
        per_doc_limit = kwargs.get("top_chunks_per_doc", 3)
        return find_relevant_chunks_top_k(query, model, relevant_docs, chunk_embeddings,
                                          per_doc_limit, metric)

    nucleus = kwargs.get("top_p", 0.6)
    floor = kwargs.get("min_similarity", 0.3)
    return find_relevant_chunks_top_p(query, model, relevant_docs, chunk_embeddings,
                                      nucleus, floor, metric)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def get_documents_for_rag(relevant_docs: List[str], document_index: Dict) -> List[str]:
    """Collect the full text of each relevant document for RAG processing.

    Documents missing from `document_index` and documents whose content is
    empty or whitespace-only are left out. Content is read from the
    'full_content' key, or from 'content' when 'full_content' is absent.
    """
    candidate_texts = [
        document_index[name].get('full_content', document_index[name].get('content', ''))
        for name in relevant_docs
        if name in document_index
    ]
    rag_documents = [text for text in candidate_texts if text.strip()]

    print(f"π Retrieved {len(rag_documents)} documents for RAG")
    return rag_documents
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def get_chunks_for_rag(relevant_chunks: List[Dict], max_chunks: int = 10) -> List[str]:
    """Format the leading `max_chunks` chunks as source-labelled text blocks.

    The input order is kept; each chunk becomes a header line naming its
    document and chunk id, followed by the chunk text.
    """
    rag_chunks = [
        f"[Document: {entry['document']}, Chunk {entry['chunk_id']}]\n{entry['text']}"
        for entry in relevant_chunks[:max_chunks]
    ]

    print(f"π Retrieved {len(rag_chunks)} chunks for RAG")
    return rag_chunks
|
src/pdf-version/retrieval/document_retriever.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Document retrieval strategies and functionality."""
|
| 2 |
+
|
| 3 |
+
from typing import List, Dict
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
from src.indexing.embedding_creator import create_text_embedding
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def find_relevant_documents_top_k(query: str, model: SentenceTransformer,
                                  tag_embeddings: Dict, doc_tag_mapping: Dict,
                                  top_k: int = 3) -> List[str]:
    """Return the `top_k` documents whose tags best match the query.

    A document's score is the highest cosine similarity between the query
    embedding and any of its tags. Documents with no tags are not scored.
    """
    query_embedding = create_text_embedding(model, query)

    # Cosine similarity of the query against every known tag
    tag_similarities = {
        tag: np.dot(query_embedding, tag_vector) / (
            np.linalg.norm(query_embedding) * np.linalg.norm(tag_vector)
        )
        for tag, tag_vector in tag_embeddings.items()
    }

    # Score each document by its single best-matching tag;
    # tags without an embedding count as similarity 0
    doc_scores = {
        pdf_name: max([tag_similarities.get(tag, 0) for tag in doc_info['tags']])
        for pdf_name, doc_info in doc_tag_mapping.items()
        if doc_info['tags']
    }

    # Highest score first, truncated to top_k
    ranked_names = sorted(doc_scores, key=doc_scores.get, reverse=True)
    relevant_docs = ranked_names[:top_k]

    print(f"π Found {len(relevant_docs)} relevant documents for query: '{query}' (TOP-K)")
    for position, doc_name in enumerate(relevant_docs, start=1):
        print(f" {position}. {doc_name} (similarity: {doc_scores[doc_name]:.3f})")

    return relevant_docs
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def find_relevant_documents_top_p(query: str, model: SentenceTransformer,
                                  tag_embeddings: Dict, doc_tag_mapping: Dict,
                                  top_p: float = 0.6, min_similarity: float = 0.5) -> List[str]:
    """Find documents using TOP-P (nucleus sampling) approach.

    A document's score is the highest cosine similarity between the query
    embedding and any of its tags. Documents scoring below `min_similarity`
    are dropped; the rest are taken in descending score order until their
    share of the total score reaches `top_p`.

    Args:
        query: Query text to embed
        model: Embedding model passed to create_text_embedding
        tag_embeddings: Mapping of tag to embedding vector
        doc_tag_mapping: Mapping of document name to a dict with a 'tags' list
        top_p: Cumulative share of the total score at which selection stops
        min_similarity: Minimum document score to be considered

    Returns:
        Selected document names, highest score first. Empty when no document
        reaches `min_similarity`.
    """
    query_embedding = create_text_embedding(model, query)

    # Calculate similarity between query and all tags
    tag_similarities = {}
    for tag, tag_embedding in tag_embeddings.items():
        similarity = np.dot(query_embedding, tag_embedding) / (
            np.linalg.norm(query_embedding) * np.linalg.norm(tag_embedding)
        )
        tag_similarities[tag] = similarity

    # Score each document by its best-matching tag; documents without tags
    # are not scored, and tags without an embedding count as similarity 0
    doc_scores = {}
    for pdf_name, doc_info in doc_tag_mapping.items():
        doc_tags = doc_info['tags']
        if doc_tags:
            doc_scores[pdf_name] = max(tag_similarities.get(tag, 0) for tag in doc_tags)

    # Filter out documents below minimum similarity threshold
    filtered_docs = {doc: score for doc, score in doc_scores.items()
                     if score >= min_similarity}

    if not filtered_docs:
        print(f"⚠️ No documents found above similarity threshold {min_similarity}")
        return []

    # Sort documents by similarity score
    sorted_docs = sorted(filtered_docs.items(), key=lambda x: x[1], reverse=True)

    total_score = sum(score for _, score in sorted_docs)

    # With min_similarity <= 0 the scores can sum to zero or less, in which
    # case they cannot be normalised into shares. Keep every document that
    # passed the threshold rather than dividing by a non-positive total.
    if total_score <= 0:
        print("⚠️ Similarity scores sum to a non-positive value; skipping TOP-P cut-off")
        return [doc_name for doc_name, _ in sorted_docs]

    # Apply TOP-P selection
    cumulative_prob = 0.0
    selected_docs = []

    for doc_name, score in sorted_docs:
        cumulative_prob += score / total_score
        selected_docs.append(doc_name)

        # Stop when we reach the TOP-P threshold
        if cumulative_prob >= top_p:
            break

    print(f"π Found {len(selected_docs)} relevant documents for query: '{query}' (TOP-P={top_p})")
    print(f"π Cumulative probability: {cumulative_prob:.3f}")

    for i, doc_name in enumerate(selected_docs):
        score = doc_scores[doc_name]
        prob = score / total_score
        print(f" {i+1}. {doc_name} (similarity: {score:.3f}, prob: {prob:.3f})")

    return selected_docs
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def find_relevant_documents_threshold(query: str, model: SentenceTransformer,
                                      tag_embeddings: Dict, doc_tag_mapping: Dict,
                                      similarity_threshold: float = 0.5) -> List[str]:
    """Return every document whose blended tag score reaches the threshold.

    The score blends the mean tag similarity (weight 0.7) with the best
    single tag similarity (weight 0.3). Results are ordered by score,
    highest first. Documents with no tags are not scored.
    """
    query_embedding = create_text_embedding(model, query)

    # Cosine similarity of the query against every known tag
    tag_similarities = {
        tag: np.dot(query_embedding, tag_vector) / (
            np.linalg.norm(query_embedding) * np.linalg.norm(tag_vector)
        )
        for tag, tag_vector in tag_embeddings.items()
    }

    doc_scores = {}
    for pdf_name, doc_info in doc_tag_mapping.items():
        doc_tags = doc_info['tags']
        if not doc_tags:
            continue

        # Tags without an embedding count as similarity 0
        per_tag = [tag_similarities.get(tag, 0) for tag in doc_tags]
        # 70% average (overall relevance) + 30% max (strongest match)
        blended = np.mean(per_tag) * 0.7 + max(per_tag) * 0.3
        if blended >= similarity_threshold:
            doc_scores[pdf_name] = blended

    # Highest score first
    relevant_docs = sorted(doc_scores, key=doc_scores.get, reverse=True)

    print(f"π Found {len(relevant_docs)} relevant documents for query: '{query}' (threshold={similarity_threshold})")
    for position, doc_name in enumerate(relevant_docs, start=1):
        print(f" {position}. {doc_name} (similarity: {doc_scores[doc_name]:.3f})")

    return relevant_docs
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def find_relevant_documents(query: str, model: SentenceTransformer,
                            tag_embeddings: Dict, doc_tag_mapping: Dict,
                            strategy: str = "top_k", **kwargs) -> List[str]:
    """Dispatch document retrieval to the implementation named by `strategy`.

    Recognised keyword options: "top_k" (top_k strategy), "top_p" and
    "min_similarity" (top_p strategy), "similarity_threshold" (threshold
    strategy). Options that are not supplied use the defaults shown below.

    Raises:
        ValueError: If `strategy` is not 'top_k', 'top_p' or 'threshold'.
    """
    # Reject unsupported strategies up front
    if strategy not in ("top_k", "top_p", "threshold"):
        raise ValueError(f"Unknown strategy: {strategy}. Use 'top_k', 'top_p', or 'threshold'")

    if strategy == "top_k":
        limit = kwargs.get("top_k", 3)
        return find_relevant_documents_top_k(query, model, tag_embeddings, doc_tag_mapping, limit)

    if strategy == "top_p":
        nucleus = kwargs.get("top_p", 0.6)
        floor = kwargs.get("min_similarity", 0.5)
        return find_relevant_documents_top_p(query, model, tag_embeddings, doc_tag_mapping, nucleus, floor)

    cutoff = kwargs.get("similarity_threshold", 0.5)
    return find_relevant_documents_threshold(query, model, tag_embeddings, doc_tag_mapping, cutoff)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def create_document_tag_mapping(document_index: Dict, tag_embeddings: Dict) -> Dict:
    """Build, for every document, a record of its tags and their embeddings.

    Each entry holds the document's 'all_tags' list under 'tags', the subset
    of those tags that have an embedding under 'tag_embeddings', and the
    document's 'symptoms', 'diagnoses' and 'treatments' lists. 'treatments'
    defaults to an empty list; the other fields must be present.
    """
    mapping = {}

    for pdf_name, doc_info in document_index.items():
        tags = doc_info['all_tags']

        # Tags without a pre-computed embedding are left out of this sub-dict
        known_embeddings = {tag: tag_embeddings[tag] for tag in tags if tag in tag_embeddings}

        mapping[pdf_name] = dict(
            tags=tags,
            tag_embeddings=known_embeddings,
            symptoms=doc_info['symptoms'],
            diagnoses=doc_info['diagnoses'],
            treatments=doc_info.get('treatments', []),
        )

    return mapping
|
src/pdf-version/utils/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utility functions."""
|
| 2 |
+
|
| 3 |
+
from .helpers import *
|
| 4 |
+
|
| 5 |
+
__all__ = []
|
src/pdf-version/utils/helpers.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utility functions and helpers."""
|
| 2 |
+
|
| 3 |
+
# Placeholder for utility functions
|
| 4 |
+
# Add common helper functions here as needed
|