File size: 15,620 Bytes
01f0120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
#!/usr/bin/env python3
"""
Enhanced Medical Context Preparation System
VedaMD Medical RAG - Phase 2: Task 2.1

This module enhances medical context preparation by:
1. Extracting medical entities from PROVIDED documents only
2. Creating clinical relationship mappings within source guidelines
3. Standardizing medical terminology from source documents
4. Adding medical metadata while maintaining strict context boundaries

CRITICAL SAFETY PROTOCOL:
- NO external medical knowledge injection
- ALL enhancements derived from provided Sri Lankan guidelines
- Maintains 100% source traceability for regulatory compliance
"""

import re
import logging
from typing import List, Dict, Set, Tuple, Optional, Any
from dataclasses import dataclass
from pathlib import Path
import json
from collections import defaultdict, Counter

# Medical entity patterns for Sri Lankan clinical guidelines
MEDICAL_ENTITY_PATTERNS = {
    # Medications and dosages (extract from provided context only)
    'medications': [
        r'\b(?:magnesium sulfate|MgSO4|oxytocin|methyldopa|nifedipine|labetalol|hydralazine)\b',
        r'\b(?:ampicillin|gentamicin|ceftriaxone|azithromycin|doxycycline)\b',
        r'\b(?:insulin|metformin|glibenclamide|aspirin|atorvastatin)\b'
    ],
    'dosages': [
        r'\b\d+(?:\.\d+)?\s*(?:mg|g|ml|units?|tablets?|caps)\b',
        r'\b(?:low|moderate|high|maximum|minimum)\s+dose\b'
    ],
    # Clinical conditions (from Sri Lankan guidelines)
    'conditions': [
        r'\b(?:preeclampsia|eclampsia|HELLP syndrome|gestational hypertension)\b',
        r'\b(?:postpartum hemorrhage|PPH|retained placenta|uterine atony)\b',
        r'\b(?:puerperal sepsis|endometritis|wound infection)\b',
        r'\b(?:gestational diabetes|GDM|diabetes mellitus)\b'
    ],
    # Procedures and interventions
    'procedures': [
        r'\b(?:cesarean section|C-section|vaginal delivery|assisted delivery)\b',
        r'\b(?:blood pressure monitoring|fetal monitoring|CTG)\b',
        r'\b(?:IV access|urinary catheter|nasogastric tube)\b'
    ],
    # Vital signs and measurements
    'vitals': [
        r'\b(?:blood pressure|BP)\s*(?:β‰₯|>=|>|<|≀|<=)?\s*\d+/\d+\s*mmHg\b',
        r'\b(?:heart rate|HR|pulse)\s*\d+\s*bpm\b',
        r'\b(?:temperature|temp)\s*\d+(?:\.\d+)?\s*Β°?[CF]\b'
    ],
    # Evidence levels (from Sri Lankan guidelines)
    'evidence_levels': [
        r'\b(?:Level I|Level II|Level III|Grade A|Grade B|Grade C)\s+evidence\b',
        r'\b(?:Expert consensus|Clinical recommendation|Strong recommendation)\b'
    ]
}

# Clinical relationship indicators (extract relationships from provided docs)
CLINICAL_RELATIONSHIPS = {
    'causes': [r'\bcause[sd]?\s+by\b', r'\bdue to\b', r'\bresult[s]?\s+from\b'],
    'treatments': [r'\btreat(?:ed|ment)?\s+with\b', r'\bmanage[d]?\s+with\b', r'\badminister\b'],
    'contraindications': [r'\bcontraindicated\b', r'\bavoid\b', r'\bdo not\b'],
    'indications': [r'\bindicated\s+for\b', r'\brecommended\s+for\b', r'\bused\s+to\s+treat\b'],
    'side_effects': [r'\bside effects?\b', r'\badverse\s+(?:effects?|reactions?)\b', r'\bcomplications?\b'],
    'dosing': [r'\bdose\b', r'\bdosage\b', r'\badminister\s+\d+', r'\bgive\s+\d+']
}

@dataclass
class MedicalEntity:
    """Medical entity extracted from source documents"""
    text: str
    entity_type: str
    source_document: str
    context: str
    confidence: float
    line_number: Optional[int] = None

@dataclass
class ClinicalRelationship:
    """Clinical relationship between medical entities within source documents"""
    entity1: str
    relationship_type: str
    entity2: str
    source_document: str
    context: str
    confidence: float

@dataclass
class EnhancedMedicalContext:
    """Enhanced medical context with entities and relationships from source docs"""
    original_content: str
    medical_entities: List[MedicalEntity]
    clinical_relationships: List[ClinicalRelationship]
    medical_concepts: Dict[str, List[str]]
    terminology_mappings: Dict[str, str]
    evidence_level: Optional[str]
    source_metadata: Dict[str, Any]

class MedicalContextEnhancer:
    """
    Enhanced medical context preparation system that maintains strict source boundaries
    """
    
    def __init__(self):
        self.setup_logging()
        self.medical_terminology_db = self._load_terminology_mappings()
        
    def setup_logging(self):
        """Setup logging for medical context enhancement"""
        logging.basicConfig(level=logging.INFO, 
                          format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)
        
    def _load_terminology_mappings(self) -> Dict[str, str]:
        """Load medical terminology mappings from processed documents"""
        # This will be populated from actual Sri Lankan guideline terminology
        return {
            # Abbreviation mappings found in source documents
            'PPH': 'Postpartum Hemorrhage',
            'MgSO4': 'Magnesium Sulfate', 
            'BP': 'Blood Pressure',
            'CTG': 'Cardiotocography',
            'IV': 'Intravenous',
            'GDM': 'Gestational Diabetes Mellitus',
            'HELLP': 'Hemolysis, Elevated Liver enzymes, Low Platelets'
        }
    
    def extract_medical_entities(self, content: str, source_document: str) -> List[MedicalEntity]:
        """
        Extract medical entities from provided document content only
        """
        entities = []
        lines = content.split('\n')
        
        for line_num, line in enumerate(lines, 1):
            line_lower = line.lower()
            
            for entity_type, patterns in MEDICAL_ENTITY_PATTERNS.items():
                for pattern in patterns:
                    matches = re.finditer(pattern, line, re.IGNORECASE)
                    for match in matches:
                        entity = MedicalEntity(
                            text=match.group(),
                            entity_type=entity_type,
                            source_document=source_document,
                            context=line.strip(),
                            confidence=self._calculate_entity_confidence(match.group(), line),
                            line_number=line_num
                        )
                        entities.append(entity)
                        
        self.logger.info(f"Extracted {len(entities)} medical entities from {source_document}")
        return entities
    
    def extract_clinical_relationships(self, content: str, source_document: str, 
                                     entities: List[MedicalEntity]) -> List[ClinicalRelationship]:
        """
        Extract clinical relationships between entities within the same source document
        """
        relationships = []
        lines = content.split('\n')
        
        # Create entity lookup for this document
        entity_texts = {entity.text.lower() for entity in entities 
                       if entity.source_document == source_document}
        
        for line_num, line in enumerate(lines, 1):
            line_lower = line.lower()
            
            # Find entities in this line
            line_entities = [text for text in entity_texts if text in line_lower]
            
            if len(line_entities) >= 2:
                # Look for relationship indicators
                for rel_type, patterns in CLINICAL_RELATIONSHIPS.items():
                    for pattern in patterns:
                        if re.search(pattern, line, re.IGNORECASE):
                            # Create relationships between entities in this line
                            for i, entity1 in enumerate(line_entities):
                                for entity2 in line_entities[i+1:]:
                                    relationship = ClinicalRelationship(
                                        entity1=entity1,
                                        relationship_type=rel_type,
                                        entity2=entity2,
                                        source_document=source_document,
                                        context=line.strip(),
                                        confidence=self._calculate_relationship_confidence(pattern, line)
                                    )
                                    relationships.append(relationship)
                                    
        self.logger.info(f"Extracted {len(relationships)} clinical relationships from {source_document}")
        return relationships
    
    def standardize_medical_terminology(self, content: str) -> str:
        """
        Standardize medical terminology based on mappings found in source documents
        """
        standardized_content = content
        
        for abbreviation, full_form in self.medical_terminology_db.items():
            # Only standardize if both forms appear in the source documents
            pattern = r'\b' + re.escape(abbreviation) + r'\b'
            standardized_content = re.sub(pattern, f"{abbreviation} ({full_form})", 
                                        standardized_content, flags=re.IGNORECASE)
                                        
        return standardized_content
    
    def extract_medical_concepts(self, entities: List[MedicalEntity]) -> Dict[str, List[str]]:
        """
        Group medical entities into conceptual categories from source documents
        """
        concepts = defaultdict(list)
        
        for entity in entities:
            concepts[entity.entity_type].append(entity.text)
            
        # Remove duplicates and sort
        for concept_type in concepts:
            concepts[concept_type] = sorted(list(set(concepts[concept_type])))
            
        return dict(concepts)
    
    def detect_evidence_level(self, content: str) -> Optional[str]:
        """
        Detect evidence level mentioned in the source document
        """
        for pattern in MEDICAL_ENTITY_PATTERNS['evidence_levels']:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                return match.group()
        return None
    
    def enhance_medical_context(self, content: str, source_document: str, 
                              metadata: Dict[str, Any] = None) -> EnhancedMedicalContext:
        """
        Main method to enhance medical context while maintaining source boundaries
        """
        self.logger.info(f"πŸ₯ Enhancing medical context for: {source_document}")
        
        # Extract medical entities from provided document only
        medical_entities = self.extract_medical_entities(content, source_document)
        
        # Extract clinical relationships within the same document
        clinical_relationships = self.extract_clinical_relationships(
            content, source_document, medical_entities
        )
        
        # Standardize terminology based on source document mappings
        standardized_content = self.standardize_medical_terminology(content)
        
        # Group entities into medical concepts
        medical_concepts = self.extract_medical_concepts(medical_entities)
        
        # Detect evidence level if mentioned in source
        evidence_level = self.detect_evidence_level(content)
        
        # Create enhanced medical context
        enhanced_context = EnhancedMedicalContext(
            original_content=content,
            medical_entities=medical_entities,
            clinical_relationships=clinical_relationships,
            medical_concepts=medical_concepts,
            terminology_mappings=self.medical_terminology_db,
            evidence_level=evidence_level,
            source_metadata=metadata or {}
        )
        
        self.logger.info(f"βœ… Enhanced medical context created: "
                        f"{len(medical_entities)} entities, {len(clinical_relationships)} relationships")
        
        return enhanced_context
    
    def _calculate_entity_confidence(self, entity_text: str, context_line: str) -> float:
        """Calculate confidence score for medical entity extraction"""
        confidence = 0.5  # Base confidence
        
        # Higher confidence for entities with dosage/numerical information
        if re.search(r'\d+', entity_text):
            confidence += 0.2
            
        # Higher confidence for entities in clinical context
        clinical_context_indicators = ['patient', 'treatment', 'diagnosis', 'management']
        if any(indicator in context_line.lower() for indicator in clinical_context_indicators):
            confidence += 0.2
            
        # Higher confidence for known medical abbreviations
        if entity_text.upper() in self.medical_terminology_db:
            confidence += 0.1
            
        return min(confidence, 1.0)
    
    def _calculate_relationship_confidence(self, pattern: str, context_line: str) -> float:
        """Calculate confidence score for clinical relationship extraction"""
        confidence = 0.6  # Base confidence for relationships
        
        # Higher confidence for explicit relationship words
        explicit_indicators = ['indicated', 'contraindicated', 'treatment', 'management']
        if any(indicator in context_line.lower() for indicator in explicit_indicators):
            confidence += 0.2
            
        # Higher confidence for dosage-related relationships
        if re.search(r'\d+(?:\.\d+)?\s*(?:mg|g|ml|units)', context_line):
            confidence += 0.1
            
        return min(confidence, 1.0)

def test_enhanced_medical_context():
    """Test the enhanced medical context preparation system"""
    print("πŸ§ͺ Testing Enhanced Medical Context Preparation System")
    
    # Sample Sri Lankan clinical guideline content
    test_content = """
    Management of Preeclampsia in Pregnancy:
    
    Preeclampsia is diagnosed when blood pressure β‰₯140/90 mmHg with proteinuria after 20 weeks.
    Severe features include BP β‰₯160/110 mmHg, severe headache, visual disturbances.
    
    Treatment Protocol:
    - Administer magnesium sulfate (MgSO4) 4g IV bolus for seizure prophylaxis
    - Control BP with methyldopa 250mg orally every 8 hours
    - Monitor fetal heart rate with CTG every 4 hours
    
    Evidence Level: Expert consensus based on Sri Lankan clinical experience.
    Contraindicated: Do not use ACE inhibitors in pregnancy.
    """
    
    enhancer = MedicalContextEnhancer()
    
    enhanced = enhancer.enhance_medical_context(
        content=test_content,
        source_document="SL-Preeclampsia-Guidelines-2024.md",
        metadata={"specialty": "Obstetrics", "country": "Sri Lanka"}
    )
    
    print(f"\nπŸ“Š Enhancement Results:")
    print(f"   Medical Entities: {len(enhanced.medical_entities)}")
    print(f"   Clinical Relationships: {len(enhanced.clinical_relationships)}")
    print(f"   Medical Concepts: {len(enhanced.medical_concepts)}")
    print(f"   Evidence Level: {enhanced.evidence_level}")
    
    print(f"\nπŸ₯ Medical Entities Found:")
    for entity in enhanced.medical_entities[:5]:  # Show first 5
        print(f"   - {entity.text} ({entity.entity_type}) - Confidence: {entity.confidence:.2f}")
    
    print(f"\nπŸ”— Clinical Relationships:")
    for relationship in enhanced.clinical_relationships[:3]:  # Show first 3
        print(f"   - {relationship.entity1} --{relationship.relationship_type}--> {relationship.entity2}")
    
    print(f"\nβœ… Enhanced Medical Context Preparation Test Completed")

if __name__ == "__main__":
    test_enhanced_medical_context()