Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Enhanced Medical Context Preparation System | |
| VedaMD Medical RAG - Phase 2: Task 2.1 | |
| This module enhances medical context preparation by: | |
| 1. Extracting medical entities from PROVIDED documents only | |
| 2. Creating clinical relationship mappings within source guidelines | |
| 3. Standardizing medical terminology from source documents | |
| 4. Adding medical metadata while maintaining strict context boundaries | |
| CRITICAL SAFETY PROTOCOL: | |
| - NO external medical knowledge injection | |
| - ALL enhancements derived from provided Sri Lankan guidelines | |
| - Maintains 100% source traceability for regulatory compliance | |
| """ | |
| import re | |
| import logging | |
| from typing import List, Dict, Set, Tuple, Optional, Any | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| import json | |
| from collections import defaultdict, Counter | |
| # Medical entity patterns for Sri Lankan clinical guidelines | |
| MEDICAL_ENTITY_PATTERNS = { | |
| # Medications and dosages (extract from provided context only) | |
| 'medications': [ | |
| r'\b(?:magnesium sulfate|MgSO4|oxytocin|methyldopa|nifedipine|labetalol|hydralazine)\b', | |
| r'\b(?:ampicillin|gentamicin|ceftriaxone|azithromycin|doxycycline)\b', | |
| r'\b(?:insulin|metformin|glibenclamide|aspirin|atorvastatin)\b' | |
| ], | |
| 'dosages': [ | |
| r'\b\d+(?:\.\d+)?\s*(?:mg|g|ml|units?|tablets?|caps)\b', | |
| r'\b(?:low|moderate|high|maximum|minimum)\s+dose\b' | |
| ], | |
| # Clinical conditions (from Sri Lankan guidelines) | |
| 'conditions': [ | |
| r'\b(?:preeclampsia|eclampsia|HELLP syndrome|gestational hypertension)\b', | |
| r'\b(?:postpartum hemorrhage|PPH|retained placenta|uterine atony)\b', | |
| r'\b(?:puerperal sepsis|endometritis|wound infection)\b', | |
| r'\b(?:gestational diabetes|GDM|diabetes mellitus)\b' | |
| ], | |
| # Procedures and interventions | |
| 'procedures': [ | |
| r'\b(?:cesarean section|C-section|vaginal delivery|assisted delivery)\b', | |
| r'\b(?:blood pressure monitoring|fetal monitoring|CTG)\b', | |
| r'\b(?:IV access|urinary catheter|nasogastric tube)\b' | |
| ], | |
| # Vital signs and measurements | |
| 'vitals': [ | |
| r'\b(?:blood pressure|BP)\s*(?:β₯|>=|>|<|β€|<=)?\s*\d+/\d+\s*mmHg\b', | |
| r'\b(?:heart rate|HR|pulse)\s*\d+\s*bpm\b', | |
| r'\b(?:temperature|temp)\s*\d+(?:\.\d+)?\s*Β°?[CF]\b' | |
| ], | |
| # Evidence levels (from Sri Lankan guidelines) | |
| 'evidence_levels': [ | |
| r'\b(?:Level I|Level II|Level III|Grade A|Grade B|Grade C)\s+evidence\b', | |
| r'\b(?:Expert consensus|Clinical recommendation|Strong recommendation)\b' | |
| ] | |
| } | |
| # Clinical relationship indicators (extract relationships from provided docs) | |
| CLINICAL_RELATIONSHIPS = { | |
| 'causes': [r'\bcause[sd]?\s+by\b', r'\bdue to\b', r'\bresult[s]?\s+from\b'], | |
| 'treatments': [r'\btreat(?:ed|ment)?\s+with\b', r'\bmanage[d]?\s+with\b', r'\badminister\b'], | |
| 'contraindications': [r'\bcontraindicated\b', r'\bavoid\b', r'\bdo not\b'], | |
| 'indications': [r'\bindicated\s+for\b', r'\brecommended\s+for\b', r'\bused\s+to\s+treat\b'], | |
| 'side_effects': [r'\bside effects?\b', r'\badverse\s+(?:effects?|reactions?)\b', r'\bcomplications?\b'], | |
| 'dosing': [r'\bdose\b', r'\bdosage\b', r'\badminister\s+\d+', r'\bgive\s+\d+'] | |
| } | |
| class MedicalEntity: | |
| """Medical entity extracted from source documents""" | |
| text: str | |
| entity_type: str | |
| source_document: str | |
| context: str | |
| confidence: float | |
| line_number: Optional[int] = None | |
| class ClinicalRelationship: | |
| """Clinical relationship between medical entities within source documents""" | |
| entity1: str | |
| relationship_type: str | |
| entity2: str | |
| source_document: str | |
| context: str | |
| confidence: float | |
| class EnhancedMedicalContext: | |
| """Enhanced medical context with entities and relationships from source docs""" | |
| original_content: str | |
| medical_entities: List[MedicalEntity] | |
| clinical_relationships: List[ClinicalRelationship] | |
| medical_concepts: Dict[str, List[str]] | |
| terminology_mappings: Dict[str, str] | |
| evidence_level: Optional[str] | |
| source_metadata: Dict[str, Any] | |
| class MedicalContextEnhancer: | |
| """ | |
| Enhanced medical context preparation system that maintains strict source boundaries | |
| """ | |
| def __init__(self): | |
| self.setup_logging() | |
| self.medical_terminology_db = self._load_terminology_mappings() | |
| def setup_logging(self): | |
| """Setup logging for medical context enhancement""" | |
| logging.basicConfig(level=logging.INFO, | |
| format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
| self.logger = logging.getLogger(__name__) | |
| def _load_terminology_mappings(self) -> Dict[str, str]: | |
| """Load medical terminology mappings from processed documents""" | |
| # This will be populated from actual Sri Lankan guideline terminology | |
| return { | |
| # Abbreviation mappings found in source documents | |
| 'PPH': 'Postpartum Hemorrhage', | |
| 'MgSO4': 'Magnesium Sulfate', | |
| 'BP': 'Blood Pressure', | |
| 'CTG': 'Cardiotocography', | |
| 'IV': 'Intravenous', | |
| 'GDM': 'Gestational Diabetes Mellitus', | |
| 'HELLP': 'Hemolysis, Elevated Liver enzymes, Low Platelets' | |
| } | |
| def extract_medical_entities(self, content: str, source_document: str) -> List[MedicalEntity]: | |
| """ | |
| Extract medical entities from provided document content only | |
| """ | |
| entities = [] | |
| lines = content.split('\n') | |
| for line_num, line in enumerate(lines, 1): | |
| line_lower = line.lower() | |
| for entity_type, patterns in MEDICAL_ENTITY_PATTERNS.items(): | |
| for pattern in patterns: | |
| matches = re.finditer(pattern, line, re.IGNORECASE) | |
| for match in matches: | |
| entity = MedicalEntity( | |
| text=match.group(), | |
| entity_type=entity_type, | |
| source_document=source_document, | |
| context=line.strip(), | |
| confidence=self._calculate_entity_confidence(match.group(), line), | |
| line_number=line_num | |
| ) | |
| entities.append(entity) | |
| self.logger.info(f"Extracted {len(entities)} medical entities from {source_document}") | |
| return entities | |
| def extract_clinical_relationships(self, content: str, source_document: str, | |
| entities: List[MedicalEntity]) -> List[ClinicalRelationship]: | |
| """ | |
| Extract clinical relationships between entities within the same source document | |
| """ | |
| relationships = [] | |
| lines = content.split('\n') | |
| # Create entity lookup for this document | |
| entity_texts = {entity.text.lower() for entity in entities | |
| if entity.source_document == source_document} | |
| for line_num, line in enumerate(lines, 1): | |
| line_lower = line.lower() | |
| # Find entities in this line | |
| line_entities = [text for text in entity_texts if text in line_lower] | |
| if len(line_entities) >= 2: | |
| # Look for relationship indicators | |
| for rel_type, patterns in CLINICAL_RELATIONSHIPS.items(): | |
| for pattern in patterns: | |
| if re.search(pattern, line, re.IGNORECASE): | |
| # Create relationships between entities in this line | |
| for i, entity1 in enumerate(line_entities): | |
| for entity2 in line_entities[i+1:]: | |
| relationship = ClinicalRelationship( | |
| entity1=entity1, | |
| relationship_type=rel_type, | |
| entity2=entity2, | |
| source_document=source_document, | |
| context=line.strip(), | |
| confidence=self._calculate_relationship_confidence(pattern, line) | |
| ) | |
| relationships.append(relationship) | |
| self.logger.info(f"Extracted {len(relationships)} clinical relationships from {source_document}") | |
| return relationships | |
| def standardize_medical_terminology(self, content: str) -> str: | |
| """ | |
| Standardize medical terminology based on mappings found in source documents | |
| """ | |
| standardized_content = content | |
| for abbreviation, full_form in self.medical_terminology_db.items(): | |
| # Only standardize if both forms appear in the source documents | |
| pattern = r'\b' + re.escape(abbreviation) + r'\b' | |
| standardized_content = re.sub(pattern, f"{abbreviation} ({full_form})", | |
| standardized_content, flags=re.IGNORECASE) | |
| return standardized_content | |
| def extract_medical_concepts(self, entities: List[MedicalEntity]) -> Dict[str, List[str]]: | |
| """ | |
| Group medical entities into conceptual categories from source documents | |
| """ | |
| concepts = defaultdict(list) | |
| for entity in entities: | |
| concepts[entity.entity_type].append(entity.text) | |
| # Remove duplicates and sort | |
| for concept_type in concepts: | |
| concepts[concept_type] = sorted(list(set(concepts[concept_type]))) | |
| return dict(concepts) | |
| def detect_evidence_level(self, content: str) -> Optional[str]: | |
| """ | |
| Detect evidence level mentioned in the source document | |
| """ | |
| for pattern in MEDICAL_ENTITY_PATTERNS['evidence_levels']: | |
| match = re.search(pattern, content, re.IGNORECASE) | |
| if match: | |
| return match.group() | |
| return None | |
| def enhance_medical_context(self, content: str, source_document: str, | |
| metadata: Dict[str, Any] = None) -> EnhancedMedicalContext: | |
| """ | |
| Main method to enhance medical context while maintaining source boundaries | |
| """ | |
| self.logger.info(f"π₯ Enhancing medical context for: {source_document}") | |
| # Extract medical entities from provided document only | |
| medical_entities = self.extract_medical_entities(content, source_document) | |
| # Extract clinical relationships within the same document | |
| clinical_relationships = self.extract_clinical_relationships( | |
| content, source_document, medical_entities | |
| ) | |
| # Standardize terminology based on source document mappings | |
| standardized_content = self.standardize_medical_terminology(content) | |
| # Group entities into medical concepts | |
| medical_concepts = self.extract_medical_concepts(medical_entities) | |
| # Detect evidence level if mentioned in source | |
| evidence_level = self.detect_evidence_level(content) | |
| # Create enhanced medical context | |
| enhanced_context = EnhancedMedicalContext( | |
| original_content=content, | |
| medical_entities=medical_entities, | |
| clinical_relationships=clinical_relationships, | |
| medical_concepts=medical_concepts, | |
| terminology_mappings=self.medical_terminology_db, | |
| evidence_level=evidence_level, | |
| source_metadata=metadata or {} | |
| ) | |
| self.logger.info(f"β Enhanced medical context created: " | |
| f"{len(medical_entities)} entities, {len(clinical_relationships)} relationships") | |
| return enhanced_context | |
| def _calculate_entity_confidence(self, entity_text: str, context_line: str) -> float: | |
| """Calculate confidence score for medical entity extraction""" | |
| confidence = 0.5 # Base confidence | |
| # Higher confidence for entities with dosage/numerical information | |
| if re.search(r'\d+', entity_text): | |
| confidence += 0.2 | |
| # Higher confidence for entities in clinical context | |
| clinical_context_indicators = ['patient', 'treatment', 'diagnosis', 'management'] | |
| if any(indicator in context_line.lower() for indicator in clinical_context_indicators): | |
| confidence += 0.2 | |
| # Higher confidence for known medical abbreviations | |
| if entity_text.upper() in self.medical_terminology_db: | |
| confidence += 0.1 | |
| return min(confidence, 1.0) | |
| def _calculate_relationship_confidence(self, pattern: str, context_line: str) -> float: | |
| """Calculate confidence score for clinical relationship extraction""" | |
| confidence = 0.6 # Base confidence for relationships | |
| # Higher confidence for explicit relationship words | |
| explicit_indicators = ['indicated', 'contraindicated', 'treatment', 'management'] | |
| if any(indicator in context_line.lower() for indicator in explicit_indicators): | |
| confidence += 0.2 | |
| # Higher confidence for dosage-related relationships | |
| if re.search(r'\d+(?:\.\d+)?\s*(?:mg|g|ml|units)', context_line): | |
| confidence += 0.1 | |
| return min(confidence, 1.0) | |
| def test_enhanced_medical_context(): | |
| """Test the enhanced medical context preparation system""" | |
| print("π§ͺ Testing Enhanced Medical Context Preparation System") | |
| # Sample Sri Lankan clinical guideline content | |
| test_content = """ | |
| Management of Preeclampsia in Pregnancy: | |
| Preeclampsia is diagnosed when blood pressure β₯140/90 mmHg with proteinuria after 20 weeks. | |
| Severe features include BP β₯160/110 mmHg, severe headache, visual disturbances. | |
| Treatment Protocol: | |
| - Administer magnesium sulfate (MgSO4) 4g IV bolus for seizure prophylaxis | |
| - Control BP with methyldopa 250mg orally every 8 hours | |
| - Monitor fetal heart rate with CTG every 4 hours | |
| Evidence Level: Expert consensus based on Sri Lankan clinical experience. | |
| Contraindicated: Do not use ACE inhibitors in pregnancy. | |
| """ | |
| enhancer = MedicalContextEnhancer() | |
| enhanced = enhancer.enhance_medical_context( | |
| content=test_content, | |
| source_document="SL-Preeclampsia-Guidelines-2024.md", | |
| metadata={"specialty": "Obstetrics", "country": "Sri Lanka"} | |
| ) | |
| print(f"\nπ Enhancement Results:") | |
| print(f" Medical Entities: {len(enhanced.medical_entities)}") | |
| print(f" Clinical Relationships: {len(enhanced.clinical_relationships)}") | |
| print(f" Medical Concepts: {len(enhanced.medical_concepts)}") | |
| print(f" Evidence Level: {enhanced.evidence_level}") | |
| print(f"\nπ₯ Medical Entities Found:") | |
| for entity in enhanced.medical_entities[:5]: # Show first 5 | |
| print(f" - {entity.text} ({entity.entity_type}) - Confidence: {entity.confidence:.2f}") | |
| print(f"\nπ Clinical Relationships:") | |
| for relationship in enhanced.clinical_relationships[:3]: # Show first 3 | |
| print(f" - {relationship.entity1} --{relationship.relationship_type}--> {relationship.entity2}") | |
| print(f"\nβ Enhanced Medical Context Preparation Test Completed") | |
| if __name__ == "__main__": | |
| test_enhanced_medical_context() |