Spaces:
Sleeping
Sleeping
Modify paths to align with the current file structure. The folder structure still needs rework.
Browse files- src/pdf-version/data/loaders.py +14 -2
- src/pdf-version/demos/demo_runner.py +7 -7
- src/pdf-version/generate_embeddings.py +3 -3
- src/pdf-version/indexing/document_indexer.py +1 -1
- src/pdf-version/indexing/storage.py +17 -2
- src/pdf-version/main.py +3 -3
- src/pdf-version/oncall_ai.py +11 -11
- src/pdf-version/rag/medical_rag_pipeline.py +26 -8
src/pdf-version/data/loaders.py
CHANGED
|
@@ -5,7 +5,7 @@ import os
|
|
| 5 |
from typing import List, Dict
|
| 6 |
|
| 7 |
|
| 8 |
-
def load_annotations(file_path: str =
|
| 9 |
"""Load medical annotations from JSON file.
|
| 10 |
|
| 11 |
Args:
|
|
@@ -14,6 +14,12 @@ def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
|
|
| 14 |
Returns:
|
| 15 |
List of annotation dictionaries.
|
| 16 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
try:
|
| 18 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 19 |
annotations = json.load(f)
|
|
@@ -25,7 +31,7 @@ def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
|
|
| 25 |
return []
|
| 26 |
|
| 27 |
|
| 28 |
-
def filter_pdf_files(annotations: List[Dict], assets_dir: str =
|
| 29 |
"""Filter and validate PDF files from annotations.
|
| 30 |
|
| 31 |
Args:
|
|
@@ -35,6 +41,12 @@ def filter_pdf_files(annotations: List[Dict], assets_dir: str = "assets") -> Lis
|
|
| 35 |
Returns:
|
| 36 |
List of valid PDF filenames.
|
| 37 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
pdf_files = []
|
| 39 |
|
| 40 |
for item in annotations:
|
|
|
|
| 5 |
from typing import List, Dict
|
| 6 |
|
| 7 |
|
| 8 |
+
def load_annotations(file_path: str = None) -> List[Dict]:
|
| 9 |
"""Load medical annotations from JSON file.
|
| 10 |
|
| 11 |
Args:
|
|
|
|
| 14 |
Returns:
|
| 15 |
List of annotation dictionaries.
|
| 16 |
"""
|
| 17 |
+
if file_path is None:
|
| 18 |
+
# Get project root directory (four parents up from this file: data -> pdf-version -> src -> project root)
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
| 21 |
+
file_path = root_dir / 'embeddings' / 'mapping.json'
|
| 22 |
+
|
| 23 |
try:
|
| 24 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 25 |
annotations = json.load(f)
|
|
|
|
| 31 |
return []
|
| 32 |
|
| 33 |
|
| 34 |
+
def filter_pdf_files(annotations: List[Dict], assets_dir: str = None) -> List[str]:
|
| 35 |
"""Filter and validate PDF files from annotations.
|
| 36 |
|
| 37 |
Args:
|
|
|
|
| 41 |
Returns:
|
| 42 |
List of valid PDF filenames.
|
| 43 |
"""
|
| 44 |
+
if assets_dir is None:
|
| 45 |
+
# Get project root directory
|
| 46 |
+
from pathlib import Path
|
| 47 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
| 48 |
+
assets_dir = root_dir / 'assets'
|
| 49 |
+
|
| 50 |
pdf_files = []
|
| 51 |
|
| 52 |
for item in annotations:
|
src/pdf-version/demos/demo_runner.py
CHANGED
|
@@ -2,13 +2,13 @@
|
|
| 2 |
|
| 3 |
from typing import Optional
|
| 4 |
|
| 5 |
-
from
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
-
from
|
| 12 |
|
| 13 |
|
| 14 |
def build_medical_rag_system(enable_chunk_embeddings: bool = True):
|
|
|
|
| 2 |
|
| 3 |
from typing import Optional
|
| 4 |
|
| 5 |
+
from models.embedding_models import load_biomedbert_model
|
| 6 |
+
from data.loaders import load_annotations
|
| 7 |
+
from indexing.document_indexer import build_document_index
|
| 8 |
+
from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
|
| 9 |
+
from indexing.storage import save_document_system, load_document_system
|
| 10 |
+
from retrieval.document_retriever import create_document_tag_mapping, find_relevant_documents
|
| 11 |
+
from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
|
| 12 |
|
| 13 |
|
| 14 |
def build_medical_rag_system(enable_chunk_embeddings: bool = True):
|
src/pdf-version/generate_embeddings.py
CHANGED
|
@@ -6,10 +6,10 @@ Quick script to generate new embeddings with sentence-based chunking
|
|
| 6 |
import sys
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
-
# Add
|
| 10 |
-
sys.path.insert(0, str(Path(__file__).parent
|
| 11 |
|
| 12 |
-
from
|
| 13 |
|
| 14 |
def main():
|
| 15 |
print("π Starting to build medical RAG system with new sentence-based chunking...")
|
|
|
|
| 6 |
import sys
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
+
# Add pdf-version directory to Python path
|
| 10 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 11 |
|
| 12 |
+
from demos.demo_runner import build_medical_rag_system
|
| 13 |
|
| 14 |
def main():
|
| 15 |
print("π Starting to build medical RAG system with new sentence-based chunking...")
|
src/pdf-version/indexing/document_indexer.py
CHANGED
|
@@ -4,7 +4,7 @@ import os
|
|
| 4 |
from typing import List, Dict
|
| 5 |
from llama_index.core import Document
|
| 6 |
from llama_index.core.node_parser import SentenceSplitter
|
| 7 |
-
from
|
| 8 |
|
| 9 |
|
| 10 |
def split_text_into_chunks(text: str, chunk_size: int = 256, chunk_overlap: int = 25) -> List[Dict]:
|
|
|
|
| 4 |
from typing import List, Dict
|
| 5 |
from llama_index.core import Document
|
| 6 |
from llama_index.core.node_parser import SentenceSplitter
|
| 7 |
+
from data.pdf_processing import extract_pdf_content_enhanced
|
| 8 |
|
| 9 |
|
| 10 |
def split_text_into_chunks(text: str, chunk_size: int = 256, chunk_overlap: int = 25) -> List[Dict]:
|
src/pdf-version/indexing/storage.py
CHANGED
|
@@ -8,7 +8,7 @@ import numpy as np
|
|
| 8 |
|
| 9 |
def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
| 10 |
doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
|
| 11 |
-
output_dir: str =
|
| 12 |
"""Save the complete document indexing system.
|
| 13 |
|
| 14 |
Args:
|
|
@@ -19,6 +19,15 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
|
| 19 |
output_dir: Output directory for saved files.
|
| 20 |
"""
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Save document index (content + metadata + chunks)
|
| 23 |
doc_index_serializable = {}
|
| 24 |
for doc_name, doc_info in document_index.items():
|
|
@@ -79,7 +88,7 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
|
| 79 |
print("✅ Document system saved to files")
|
| 80 |
|
| 81 |
|
| 82 |
-
def load_document_system(input_dir: str =
|
| 83 |
"""Load the complete document indexing system.
|
| 84 |
|
| 85 |
Args:
|
|
@@ -89,6 +98,12 @@ def load_document_system(input_dir: str = ".") -> Tuple[Optional[Dict], Optional
|
|
| 89 |
Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings).
|
| 90 |
Returns (None, None, None, None) if loading fails.
|
| 91 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
try:
|
| 93 |
# Load document index
|
| 94 |
with open(os.path.join(input_dir, 'document_index.json'), 'r', encoding='utf-8') as f:
|
|
|
|
| 8 |
|
| 9 |
def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
| 10 |
doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
|
| 11 |
+
output_dir: str = None):
|
| 12 |
"""Save the complete document indexing system.
|
| 13 |
|
| 14 |
Args:
|
|
|
|
| 19 |
output_dir: Output directory for saved files.
|
| 20 |
"""
|
| 21 |
|
| 22 |
+
if output_dir is None:
|
| 23 |
+
# Get project root directory
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
| 26 |
+
output_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
| 27 |
+
|
| 28 |
+
# Ensure output directory exists
|
| 29 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 30 |
+
|
| 31 |
# Save document index (content + metadata + chunks)
|
| 32 |
doc_index_serializable = {}
|
| 33 |
for doc_name, doc_info in document_index.items():
|
|
|
|
| 88 |
print("✅ Document system saved to files")
|
| 89 |
|
| 90 |
|
| 91 |
+
def load_document_system(input_dir: str = None) -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict]]:
|
| 92 |
"""Load the complete document indexing system.
|
| 93 |
|
| 94 |
Args:
|
|
|
|
| 98 |
Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings).
|
| 99 |
Returns (None, None, None, None) if loading fails.
|
| 100 |
"""
|
| 101 |
+
if input_dir is None:
|
| 102 |
+
# Get project root directory
|
| 103 |
+
from pathlib import Path
|
| 104 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
| 105 |
+
input_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
| 106 |
+
|
| 107 |
try:
|
| 108 |
# Load document index
|
| 109 |
with open(os.path.join(input_dir, 'document_index.json'), 'r', encoding='utf-8') as f:
|
src/pdf-version/main.py
CHANGED
|
@@ -7,10 +7,10 @@ Main entry point for the medical RAG system.
|
|
| 7 |
import sys
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
-
# Add
|
| 11 |
-
sys.path.insert(0, str(Path(__file__).parent
|
| 12 |
|
| 13 |
-
from
|
| 14 |
|
| 15 |
|
| 16 |
def main():
|
|
|
|
| 7 |
import sys
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
+
# Add pdf-version directory to Python path
|
| 11 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 12 |
|
| 13 |
+
from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
|
| 14 |
|
| 15 |
|
| 16 |
def main():
|
src/pdf-version/oncall_ai.py
CHANGED
|
@@ -8,26 +8,26 @@ Import everything from the new modular structure.
|
|
| 8 |
import sys
|
| 9 |
from pathlib import Path
|
| 10 |
|
| 11 |
-
# Add
|
| 12 |
-
sys.path.insert(0, str(Path(__file__).parent
|
| 13 |
|
| 14 |
# Import all functions for backward compatibility
|
| 15 |
-
from
|
| 16 |
-
from
|
| 17 |
-
from
|
| 18 |
extract_pdf_text, extract_tables_from_pdf,
|
| 19 |
extract_images_ocr_from_pdf, extract_pdf_content_enhanced
|
| 20 |
)
|
| 21 |
-
from
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
-
from
|
| 25 |
find_relevant_documents_top_k, find_relevant_documents_top_p,
|
| 26 |
find_relevant_documents_threshold, find_relevant_documents,
|
| 27 |
create_document_tag_mapping
|
| 28 |
)
|
| 29 |
-
from
|
| 30 |
-
from
|
| 31 |
|
| 32 |
# Main function for backward compatibility
|
| 33 |
def main():
|
|
|
|
| 8 |
import sys
|
| 9 |
from pathlib import Path
|
| 10 |
|
| 11 |
+
# Add pdf-version directory to Python path
|
| 12 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 13 |
|
| 14 |
# Import all functions for backward compatibility
|
| 15 |
+
from models.embedding_models import load_biomedbert_model, load_meditron_model
|
| 16 |
+
from data.loaders import load_annotations, filter_pdf_files
|
| 17 |
+
from data.pdf_processing import (
|
| 18 |
extract_pdf_text, extract_tables_from_pdf,
|
| 19 |
extract_images_ocr_from_pdf, extract_pdf_content_enhanced
|
| 20 |
)
|
| 21 |
+
from indexing.document_indexer import build_document_index, split_text_into_chunks
|
| 22 |
+
from indexing.embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
|
| 23 |
+
from indexing.storage import save_document_system, load_document_system
|
| 24 |
+
from retrieval.document_retriever import (
|
| 25 |
find_relevant_documents_top_k, find_relevant_documents_top_p,
|
| 26 |
find_relevant_documents_threshold, find_relevant_documents,
|
| 27 |
create_document_tag_mapping
|
| 28 |
)
|
| 29 |
+
from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
|
| 30 |
+
from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
|
| 31 |
|
| 32 |
# Main function for backward compatibility
|
| 33 |
def main():
|
src/pdf-version/rag/medical_rag_pipeline.py
CHANGED
|
@@ -7,10 +7,6 @@ from typing import Dict, List, Optional, Tuple
|
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
|
| 9 |
# Import existing retrieval components
|
| 10 |
-
import sys
|
| 11 |
-
import os
|
| 12 |
-
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 13 |
-
|
| 14 |
from retrieval.document_retriever import find_relevant_documents
|
| 15 |
from retrieval.chunk_retriever import find_relevant_chunks, get_chunks_for_rag
|
| 16 |
from models.embedding_models import load_biomedbert_model
|
|
@@ -391,10 +387,10 @@ def answer_medical_query(query: str,
|
|
| 391 |
return complete_result
|
| 392 |
|
| 393 |
|
| 394 |
-
def load_rag_data(tag_embeddings_path: str =
|
| 395 |
-
chunk_embeddings_path: str =
|
| 396 |
-
doc_tag_mapping_path: str =
|
| 397 |
-
document_index_path: str =
|
| 398 |
"""
|
| 399 |
Load all RAG data needed for medical question answering.
|
| 400 |
|
|
@@ -409,6 +405,28 @@ def load_rag_data(tag_embeddings_path: str = "tag_embeddings.json",
|
|
| 409 |
"""
|
| 410 |
print("π Loading Medical RAG Data...")
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
# Load embedding model
|
| 413 |
print("π¦ Loading BGE Large Medical embedding model...")
|
| 414 |
embedding_model = load_biomedbert_model()
|
|
|
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
|
| 9 |
# Import existing retrieval components
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
from retrieval.document_retriever import find_relevant_documents
|
| 11 |
from retrieval.chunk_retriever import find_relevant_chunks, get_chunks_for_rag
|
| 12 |
from models.embedding_models import load_biomedbert_model
|
|
|
|
| 387 |
return complete_result
|
| 388 |
|
| 389 |
|
| 390 |
+
def load_rag_data(tag_embeddings_path: str = None,
|
| 391 |
+
chunk_embeddings_path: str = None,
|
| 392 |
+
doc_tag_mapping_path: str = None,
|
| 393 |
+
document_index_path: str = None) -> Tuple[SentenceTransformer, Dict, Dict, Dict, Dict]:
|
| 394 |
"""
|
| 395 |
Load all RAG data needed for medical question answering.
|
| 396 |
|
|
|
|
| 405 |
"""
|
| 406 |
print("π Loading Medical RAG Data...")
|
| 407 |
|
| 408 |
+
# Set default paths if not provided
|
| 409 |
+
if tag_embeddings_path is None:
|
| 410 |
+
from pathlib import Path
|
| 411 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
| 412 |
+
embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
| 413 |
+
tag_embeddings_path = embeddings_dir / 'tag_embeddings.json'
|
| 414 |
+
if chunk_embeddings_path is None:
|
| 415 |
+
from pathlib import Path
|
| 416 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
| 417 |
+
embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
| 418 |
+
chunk_embeddings_path = embeddings_dir / 'chunk_embeddings.json'
|
| 419 |
+
if doc_tag_mapping_path is None:
|
| 420 |
+
from pathlib import Path
|
| 421 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
| 422 |
+
embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
| 423 |
+
doc_tag_mapping_path = embeddings_dir / 'document_tag_mapping.json'
|
| 424 |
+
if document_index_path is None:
|
| 425 |
+
from pathlib import Path
|
| 426 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
| 427 |
+
embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
| 428 |
+
document_index_path = embeddings_dir / 'document_index.json'
|
| 429 |
+
|
| 430 |
# Load embedding model
|
| 431 |
print("π¦ Loading BGE Large Medical embedding model...")
|
| 432 |
embedding_model = load_biomedbert_model()
|