Merge pull request #1 from YanBoChen0928/dataprocessing
s/p Data preprocessing, -> go for processing, embedding...
- .gitignore +1 -1
- dataset/analysis/integrity_check/integrity_check_report.json +29 -0
- dataset/analysis/keyword_matching_test_results.json +151 -0
- dataset/analysis/stats/analysis_stats_emergency_subset.json +55 -0
- dataset/analysis/stats/analysis_stats_emergency_subset_opt.json +55 -0
- dataset/analysis/subset_comparison_first10_records_20250726_163149.md +198 -0
- dataset/analysis/subset_comparison_first10_records_20250726_163158.md +198 -0
- dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json +293 -0
- dataset/check_source.py +18 -0
- dataset/filter_guidelines.py +31 -0
- dataset/keywords/emergency_keywords.txt +47 -0
- dataset/keywords/special_terms_emergency.json +26 -0
- dataset/keywords/special_terms_treatment.json +25 -0
- dataset/keywords/treatment_keywords.txt +105 -0
- dataset/scripts/01_filter_emergency.py +58 -0
- dataset/scripts/01_filter_emergency_opt.py +112 -0
- dataset/scripts/02_filter_treatment.py +103 -0
- dataset/scripts/02_filter_treatment_opt.py +131 -0
- dataset/scripts/check_subset_integrity.py +178 -0
- dataset/scripts/commit_message_20250726_special_terms.txt +39 -0
- dataset/scripts/compare_subsets_opt.py +124 -0
- dataset/scripts/data_explorer.py +123 -0
- dataset/scripts/data_explorer_opt.py +118 -0
- dataset/scripts/data_explorer_treatment.py +265 -0
- dataset/scripts/data_explorer_treatment_opt.py +262 -0
- dataset/scripts/keyword_Match_Clean_for_subset_filter.txt +85 -0
- dataset/scripts/test_keyword_matching.py +175 -0
- requirements.txt +7 -0
.gitignore CHANGED
@@ -1,4 +1,4 @@
-dataset/
+dataset/dataset/
 
 #virtual environment
 genAIvenv/
dataset/analysis/integrity_check/integrity_check_report.json ADDED
@@ -0,0 +1,29 @@
+{
+  "sample_analysis": {
+    "matched": {
+      "non_null": 100,
+      "non_empty": 100,
+      "unique_values": 84
+    },
+    "treatment_matched": {
+      "non_null": 100,
+      "non_empty": 100,
+      "unique_values": 100
+    }
+  },
+  "full_file_analysis": {
+    "total_records": 9367,
+    "matched_column": {
+      "non_null_count": 9367,
+      "non_empty_count": 9367,
+      "null_percentage": 0.0
+    },
+    "treatment_matched_column": {
+      "non_null_count": 9367,
+      "non_empty_count": 9367,
+      "null_percentage": 0.0
+    },
+    "both_matched_count": 3315,
+    "both_matched_percentage": 35.39019963702359
+  }
+}
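The PR ships dataset/scripts/check_subset_integrity.py (+178 lines, body not shown in this diff), which presumably produces the report above. As a minimal sketch — not that script — the full_file_analysis block could be computed with pandas roughly like this; the input path and the exact "both matched" criterion are assumptions:

```python
import pandas as pd

# Assumed path; the actual subset file produced by the filter scripts may differ.
df = pd.read_json("dataset/emergency/emergency_treatment_subset.jsonl", lines=True)

def column_stats(col: pd.Series) -> dict:
    """Non-null / non-empty counts and null percentage for one keyword column."""
    return {
        "non_null_count": int(col.notna().sum()),
        "non_empty_count": int((col.fillna("").str.len() > 0).sum()),
        "null_percentage": 100.0 * float(col.isna().mean()),
    }

# Records where both keyword columns matched something (assumed criterion).
both = int(((df["matched"].fillna("") != "") & (df["treatment_matched"].fillna("") != "")).sum())
report = {
    "total_records": len(df),
    "matched_column": column_stats(df["matched"]),
    "treatment_matched_column": column_stats(df["treatment_matched"]),
    "both_matched_count": both,
    "both_matched_percentage": 100.0 * both / len(df),
}
print(report)
```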
dataset/analysis/keyword_matching_test_results.json ADDED
@@ -0,0 +1,151 @@
+{
+  "special_terms_matching": [
+    {
+      "clean_text": "Patient needs an x-ray of the chest",
+      "category": "x-ray variants",
+      "matched": "x-ray"
+    },
+    {
+      "clean_text": "Ordered chest xray",
+      "category": "x-ray variants",
+      "matched": "xray"
+    },
+    {
+      "clean_text": "X ray shows pneumonia",
+      "category": "x-ray variants",
+      "matched": "X ray"
+    },
+    {
+      "clean_text": "XRAY negative",
+      "category": "x-ray variants",
+      "matched": "XRAY"
+    },
+    {
+      "clean_text": "CT scan reveals nodule",
+      "category": "ct-scan variants",
+      "matched": "CT scan"
+    },
+    {
+      "clean_text": "CT-scan indicates mass",
+      "category": "ct-scan variants",
+      "matched": "CT-scan"
+    },
+    {
+      "clean_text": "Requires ctscan urgently",
+      "category": "ct-scan variants",
+      "matched": "ctscan"
+    },
+    {
+      "clean_text": "CTSCAN of abdomen",
+      "category": "ct-scan variants",
+      "matched": "CTSCAN"
+    },
+    {
+      "clean_text": "Point-of-care testing needed",
+      "category": "point-of-care variants",
+      "matched": "Point-of-care"
+    },
+    {
+      "clean_text": "Point of care ultrasound",
+      "category": "point-of-care variants",
+      "matched": "Point of care"
+    },
+    {
+      "clean_text": "POC testing results",
+      "category": "point-of-care variants",
+      "matched": ""
+    },
+    {
+      "clean_text": "Ordered both x-ray and CT scan",
+      "category": "mixed cases",
+      "matched": "x-ray|CT scan"
+    },
+    {
+      "clean_text": "XRAY and CTSCAN negative",
+      "category": "mixed cases",
+      "matched": "XRAY|CTSCAN"
+    },
+    {
+      "clean_text": "Multiple point-of-care tests with x-ray",
+      "category": "mixed cases",
+      "matched": "point-of-care|x-ray"
+    },
+    {
+      "clean_text": "No imaging mentioned",
+      "category": "negative cases",
+      "matched": ""
+    },
+    {
+      "clean_text": "Regular examination only",
+      "category": "negative cases",
+      "matched": ""
+    },
+    {
+      "clean_text": "Laboratory tests pending",
+      "category": "negative cases",
+      "matched": ""
+    }
+  ],
+  "basic_matching": [
+    {
+      "clean_text": "Emergency treatment required",
+      "category": "simple matches",
+      "matched": "Emergency"
+    },
+    {
+      "clean_text": "Acute condition observed",
+      "category": "simple matches",
+      "matched": "Acute"
+    },
+    {
+      "clean_text": "Urgent care needed",
+      "category": "simple matches",
+      "matched": "Urgent"
+    },
+    {
+      "clean_text": "EMERGENCY situation",
+      "category": "case variations",
+      "matched": "EMERGENCY"
+    },
+    {
+      "clean_text": "Acute RESPIRATORY failure",
+      "category": "case variations",
+      "matched": "Acute"
+    },
+    {
+      "clean_text": "URgent surgical intervention",
+      "category": "case variations",
+      "matched": "URgent"
+    },
+    {
+      "clean_text": "Emergency treatment for acute condition",
+      "category": "multiple matches",
+      "matched": "Emergency|acute"
+    },
+    {
+      "clean_text": "Urgent care in emergency department",
+      "category": "multiple matches",
+      "matched": "Urgent|emergency"
+    },
+    {
+      "clean_text": "Acute respiratory emergency",
+      "category": "multiple matches",
+      "matched": "Acute|emergency"
+    },
+    {
+      "clean_text": "Non-emergency situation",
+      "category": "partial words",
+      "matched": "emergency"
+    },
+    {
+      "clean_text": "Subacute condition",
+      "category": "partial words",
+      "matched": ""
+    },
+    {
+      "clean_text": "Emergency-related",
+      "category": "partial words",
+      "matched": "Emergency"
+    }
+  ]
+}
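The "partial words" cases above pin down a subtlety of the \b word-boundary matching used throughout the filter scripts: a hyphen is a non-word character, so it forms a boundary and "Non-emergency" still matches "emergency", while "Subacute" has no boundary before "acute" and does not match. A minimal reproduction, with the keyword list abbreviated to the three terms exercised in basic_matching:

```python
import re

# Same pattern shape as the filter scripts: \b-delimited, case-insensitive alternation.
pattern = re.compile(r"\b(?:emergency|acute|urgent)\b", re.IGNORECASE)

print(pattern.findall("Non-emergency situation"))  # ['emergency'] -- hyphen is a boundary
print(pattern.findall("Subacute condition"))       # []            -- no boundary inside a word
print(pattern.findall("Emergency-related"))        # ['Emergency']
```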
dataset/analysis/stats/analysis_stats_emergency_subset.json ADDED
@@ -0,0 +1,55 @@
+{
+  "basic_statistics": {
+    "total_records": 10282,
+    "avg_length": 25185.078194903715
+  },
+  "keyword_statistics": {
+    "Acute abdomen": 52,
+    "Acute bleeding": 31,
+    "Acute Coronary Syndrome": 345,
+    "Acute Kidney Injury": 202,
+    "Acute pancreatitis": 214,
+    "Acute respiratory distress syndrome": 231,
+    "Acute stroke": 67,
+    "Anaphylaxis": 1016,
+    "Anaphylactic Shock": 153,
+    "Arrhythmia": 1547,
+    "Atrial fibrillation": 771,
+    "Atrial flutter": 146,
+    "Bradycardia": 884,
+    "Cardiac arrest": 614,
+    "Cardiogenic Shock": 196,
+    "Chest pain": 1433,
+    "Dyspnea": 1319,
+    "Fever": 4270,
+    "Gastrointestinal Hemorrhage": 158,
+    "GI bleeding": 105,
+    "Hemorrhage": 1611,
+    "Hemorrhagic stroke": 117,
+    "Hyperthermia": 305,
+    "Hypovolemic Shock": 63,
+    "Hypotension": 1929,
+    "Hypothermia": 356,
+    "Internal bleeding": 70,
+    "Intracranial Hemorrhages": 6,
+    "Ischemic stroke": 224,
+    "Loss of consciousness": 422,
+    "Myocardial Infarction": 1708,
+    "MI": 10183,
+    "Pulmonary Edema": 487,
+    "Pulmonary Embolism": 654,
+    "Respiratory distress": 730,
+    "Respiratory failure": 579,
+    "Sepsis": 1181,
+    "Severe Sepsis": 81,
+    "Septic Shock": 244,
+    "Shock": 1881,
+    "Status Epilepticus": 150,
+    "Syncope": 834,
+    "Tachycardia": 1650,
+    "Tachypnea": 268,
+    "Traumatic Brain Injury": 171,
+    "Ventricular Tachycardia": 491,
+    "Ventricular fibrillation": 295
+  }
+}
dataset/analysis/stats/analysis_stats_emergency_subset_opt.json ADDED
@@ -0,0 +1,55 @@
+{
+  "basic_statistics": {
+    "total_records": 11914,
+    "avg_length": 23847.07579318449
+  },
+  "keyword_statistics": {
+    "Acute abdomen": 52,
+    "Acute bleeding": 31,
+    "Acute Coronary Syndrome": 351,
+    "Acute Kidney Injury": 202,
+    "Acute pancreatitis": 214,
+    "Acute respiratory distress syndrome": 231,
+    "Acute stroke": 67,
+    "Anaphylaxis": 1016,
+    "Anaphylactic Shock": 153,
+    "Arrhythmia": 1564,
+    "Atrial fibrillation": 771,
+    "Atrial flutter": 146,
+    "Bradycardia": 884,
+    "Cardiac arrest": 614,
+    "Cardiogenic Shock": 196,
+    "Chest pain": 1434,
+    "Dyspnea": 1319,
+    "Fever": 4279,
+    "Gastrointestinal Hemorrhage": 158,
+    "GI bleeding": 105,
+    "Hemorrhage": 1621,
+    "Hemorrhagic stroke": 117,
+    "Hyperthermia": 305,
+    "Hypovolemic Shock": 63,
+    "Hypotension": 1929,
+    "Hypothermia": 356,
+    "Internal bleeding": 70,
+    "Intracranial Hemorrhages": 6,
+    "Ischemic stroke": 225,
+    "Loss of consciousness": 422,
+    "Myocardial Infarction": 1710,
+    "MI": 11773,
+    "Pulmonary Edema": 487,
+    "Pulmonary Embolism": 654,
+    "Respiratory distress": 730,
+    "Respiratory failure": 579,
+    "Sepsis": 1188,
+    "Severe Sepsis": 81,
+    "Septic Shock": 244,
+    "Shock": 1892,
+    "Status Epilepticus": 150,
+    "Syncope": 834,
+    "Tachycardia": 1651,
+    "Tachypnea": 268,
+    "Traumatic Brain Injury": 171,
+    "Ventricular Tachycardia": 492,
+    "Ventricular fibrillation": 295
+  }
+}
dataset/analysis/subset_comparison_first10_records_20250726_163149.md ADDED
@@ -0,0 +1,198 @@
+# Optimized Subsets Comparison Report
+
+Generated on: 2025-07-26 16:31:49
+
+File format: CSV
+
+
+## Basic Statistics
+
+- Emergency subset total records: 11914
+- Emergency+Treatment subset total records: 11023
+- Avg Emergency Text Length: 23847.08
+- Avg Treatment Text Length: 25408.64
+- Avg Emergency Keywords: 2.85
+- Avg Treatment Keywords: 2.97
+
+## Emergency Subset (First 10 Records)
+
+
+### Record 1
+```
+Text preview: # Section 1: Recommendations
+
+# RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
+Matched keywords: shock
+Text length: 37792
+```
+
+
+### Record 2
+```
+Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
+Matched keywords: hemorrhage
+Text length: 7559
+```
+
+
+### Record 3
+```
+Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
+Matched keywords: ards|pulmonary embolism
+Text length: 11731
+```
+
+
+### Record 4
+```
+Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
+Matched keywords: fever|dyspnea|hypotension|sepsis
+Text length: 46087
+```
+
+
+### Record 5
+```
+Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
+Matched keywords: hyperthermia
+Text length: 35302
+```
+
+
+### Record 6
+```
+Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
+Matched keywords: hemorrhage|dyspnea
+Text length: 16186
+```
+
+
+### Record 7
+```
+Text preview: # GUIDELINE OBJECTIVES
+The objective of this guideline is to update a previous guideline on chemothe...
+Matched keywords: hemorrhage
+Text length: 7551
+```
+
+
+### Record 8
+```
+Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
+Matched keywords: mi
+Text length: 50729
+```
+
+
+### Record 9
+```
+Text preview: # GUIDELINE OBJECTIVE
+This guideline was written to provide guidance on the most appropriate follow-...
+Matched keywords: hemorrhage
+Text length: 4299
+```
+
+
+### Record 10
+```
+Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
+Matched keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
+Text length: 54427
+```
+
+
+## Emergency+Treatment Subset (First 10 Records)
+
+
+### Record 1
+```
+Text preview: # Section 1: Recommendations
+
+# RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
+Emergency keywords: shock
+Treatment keywords: management|medication|procedure|fluid|monitoring|iv|administer|dose
+Text length: 37792
+```
+
+
+### Record 2
+```
+Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
+Emergency keywords: hemorrhage
+Treatment keywords: Therapy|treatment|x-ray|us|ct
+Text length: 7559
+```
+
+
+### Record 3
+```
+Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
+Emergency keywords: ards|pulmonary embolism
+Treatment keywords: dopamine|therapy|treatment|surgery|iv|intervention|dose
+Text length: 11731
+```
+
+
+### Record 4
+```
+Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
+Emergency keywords: fever|dyspnea|hypotension|sepsis
+Treatment keywords: treatment|iv|therapy|treat|management|intervention|supportive care|dose
+Text length: 46087
+```
+
+
+### Record 5
+```
+Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
+Emergency keywords: hyperthermia
+Treatment keywords: surgery|treatment|therapy|treat|dose|ct
+Text length: 35302
+```
+
+
+### Record 6
+```
+Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
+Emergency keywords: hemorrhage|dyspnea
+Treatment keywords: therapy|management|treatment|morphine|dose
+Text length: 16186
+```
+
+
+### Record 7
+```
+Text preview: # GUIDELINE OBJECTIVES
+The objective of this guideline is to update a previous guideline on chemothe...
+Emergency keywords: hemorrhage
+Treatment keywords: therapy|treatment|surgery
+Text length: 7551
+```
+
+
+### Record 8
+```
+Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
+Emergency keywords: mi
+Treatment keywords: iv|Dose|therapy|administer|surgery|treatment|treat|medication|ecg
+Text length: 50729
+```
+
+
+### Record 9
+```
+Text preview: # GUIDELINE OBJECTIVE
+This guideline was written to provide guidance on the most appropriate follow-...
+Emergency keywords: hemorrhage
+Treatment keywords: treatment|ct
+Text length: 4299
+```
+
+
+### Record 10
+```
+Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
+Emergency keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
+Treatment keywords: treatment|oxygen|iv|dose|therapy|surgery|x-ray|administer|procedure|management
+Text length: 54427
+```
dataset/analysis/subset_comparison_first10_records_20250726_163158.md ADDED
@@ -0,0 +1,198 @@
+# Optimized Subsets Comparison Report
+
+Generated on: 2025-07-26 16:31:58
+
+File format: JSONL
+
+
+## Basic Statistics
+
+- Emergency subset total records: 11914
+- Emergency+Treatment subset total records: 11023
+- Avg Emergency Text Length: 23847.08
+- Avg Treatment Text Length: 25408.64
+- Avg Emergency Keywords: 2.85
+- Avg Treatment Keywords: 2.97
+
+## Emergency Subset (First 10 Records)
+
+
+### Record 1
+```
+Text preview: # Section 1: Recommendations
+
+# RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
+Matched keywords: shock
+Text length: 37792
+```
+
+
+### Record 2
+```
+Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
+Matched keywords: hemorrhage
+Text length: 7559
+```
+
+
+### Record 3
+```
+Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
+Matched keywords: ards|pulmonary embolism
+Text length: 11731
+```
+
+
+### Record 4
+```
+Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
+Matched keywords: fever|dyspnea|hypotension|sepsis
+Text length: 46087
+```
+
+
+### Record 5
+```
+Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
+Matched keywords: hyperthermia
+Text length: 35302
+```
+
+
+### Record 6
+```
+Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
+Matched keywords: hemorrhage|dyspnea
+Text length: 16186
+```
+
+
+### Record 7
+```
+Text preview: # GUIDELINE OBJECTIVES
+The objective of this guideline is to update a previous guideline on chemothe...
+Matched keywords: hemorrhage
+Text length: 7551
+```
+
+
+### Record 8
+```
+Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
+Matched keywords: mi
+Text length: 50729
+```
+
+
+### Record 9
+```
+Text preview: # GUIDELINE OBJECTIVE
+This guideline was written to provide guidance on the most appropriate follow-...
+Matched keywords: hemorrhage
+Text length: 4299
+```
+
+
+### Record 10
+```
+Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
+Matched keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
+Text length: 54427
+```
+
+
+## Emergency+Treatment Subset (First 10 Records)
+
+
+### Record 1
+```
+Text preview: # Section 1: Recommendations
+
+# RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
+Emergency keywords: shock
+Treatment keywords: management|medication|procedure|fluid|monitoring|iv|administer|dose
+Text length: 37792
+```
+
+
+### Record 2
+```
+Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
+Emergency keywords: hemorrhage
+Treatment keywords: Therapy|treatment|x-ray|us|ct
+Text length: 7559
+```
+
+
+### Record 3
+```
+Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
+Emergency keywords: ards|pulmonary embolism
+Treatment keywords: dopamine|therapy|treatment|surgery|iv|intervention|dose
+Text length: 11731
+```
+
+
+### Record 4
+```
+Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
+Emergency keywords: fever|dyspnea|hypotension|sepsis
+Treatment keywords: treatment|iv|therapy|treat|management|intervention|supportive care|dose
+Text length: 46087
+```
+
+
+### Record 5
+```
+Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
+Emergency keywords: hyperthermia
+Treatment keywords: surgery|treatment|therapy|treat|dose|ct
+Text length: 35302
+```
+
+
+### Record 6
+```
+Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
+Emergency keywords: hemorrhage|dyspnea
+Treatment keywords: therapy|management|treatment|morphine|dose
+Text length: 16186
+```
+
+
+### Record 7
+```
+Text preview: # GUIDELINE OBJECTIVES
+The objective of this guideline is to update a previous guideline on chemothe...
+Emergency keywords: hemorrhage
+Treatment keywords: therapy|treatment|surgery
+Text length: 7551
+```
+
+
+### Record 8
+```
+Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
+Emergency keywords: mi
+Treatment keywords: iv|Dose|therapy|administer|surgery|treatment|treat|medication|ecg
+Text length: 50729
+```
+
+
+### Record 9
+```
+Text preview: # GUIDELINE OBJECTIVE
+This guideline was written to provide guidance on the most appropriate follow-...
+Emergency keywords: hemorrhage
+Treatment keywords: treatment|ct
+Text length: 4299
+```
+
+
+### Record 10
+```
+Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
+Emergency keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
+Treatment keywords: treatment|oxygen|iv|dose|therapy|surgery|x-ray|administer|procedure|management
+Text length: 54427
+```
dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json ADDED
@@ -0,0 +1,293 @@
+{
+  "basic_statistics": {
+    "total_records": 9367,
+    "avg_text_length": 27179.22952919825,
+    "emergency_keywords_count": 47,
+    "treatment_keywords_count": 105
+  },
+  "emergency_keyword_stats": {
+    "Acute abdomen": 51,
+    "Acute bleeding": 31,
+    "Acute Coronary Syndrome": 332,
+    "Acute Kidney Injury": 200,
+    "Acute pancreatitis": 202,
+    "Acute respiratory distress syndrome": 225,
+    "Acute stroke": 65,
+    "Anaphylaxis": 1002,
+    "Anaphylactic Shock": 148,
+    "Arrhythmia": 1490,
+    "Atrial fibrillation": 736,
+    "Atrial flutter": 139,
+    "Bradycardia": 845,
+    "Cardiac arrest": 600,
+    "Cardiogenic Shock": 192,
+    "Chest pain": 1408,
+    "Dyspnea": 1296,
+    "Fever": 4008,
+    "Gastrointestinal Hemorrhage": 158,
+    "GI bleeding": 103,
+    "Hemorrhage": 1532,
+    "Hemorrhagic stroke": 109,
+    "Hyperthermia": 283,
+    "Hypovolemic Shock": 61,
+    "Hypotension": 1897,
+    "Hypothermia": 340,
+    "Internal bleeding": 67,
+    "Intracranial Hemorrhages": 5,
+    "Ischemic stroke": 216,
+    "Loss of consciousness": 406,
+    "Myocardial Infarction": 1607,
+    "MI": 9316,
+    "Pulmonary Edema": 471,
+    "Pulmonary Embolism": 624,
+    "Respiratory distress": 713,
+    "Respiratory failure": 554,
+    "Sepsis": 1145,
+    "Severe Sepsis": 81,
+    "Septic Shock": 231,
+    "Shock": 1702,
+    "Status Epilepticus": 149,
+    "Syncope": 806,
+    "Tachycardia": 1576,
+    "Tachypnea": 262,
+    "Traumatic Brain Injury": 151,
+    "Ventricular Tachycardia": 461,
+    "Ventricular fibrillation": 280
+  },
+  "treatment_keyword_stats": {
+    "ACLS": 30,
+    "administer": 3881,
+    "Adrenaline": 135,
+    "Advanced Cardiac Life Support": 34,
+    "Airway Management": 174,
+    "alpha blocker": 35,
+    "Amiodarone": 315,
+    "analgesia": 323,
+    "Anesthesia Procedural": 0,
+    "Anti-Bacterial Agents": 1,
+    "antibiotic": 1922,
+    "arterial line placement": 0,
+    "beta blocker": 297,
+    "Bi-level Positive Airway Pressure": 6,
+    "bipap": 25,
+    "Blood Transfusion": 379,
+    "Bosmin": 0,
+    "Cardiopulmonary Resuscitation": 131,
+    "Cardioversion": 142,
+    "Catheterization Arterial": 0,
+    "Catheterization Central Venous": 0,
+    "central line placement": 6,
+    "compression dressing": 2,
+    "Computed Tomography": 518,
+    "cpap": 84,
+    "cpr": 151,
+    "crystalloids": 45,
+    "ct scan": 1036,
+    "Defibrillation": 96,
+    "Dopamine": 389,
+    "Dosage Forms": 210,
+    "dose": 5344,
+    "Drug Administration Routes": 0,
+    "Drug Therapy": 773,
+    "Epinephrine": 806,
+    "fluid": 2938,
+    "fluid resuscitation": 115,
+    "hemodynamic monitoring": 43,
+    "Hemodynamics": 135,
+    "Hemostasis": 180,
+    "Ibuprofen": 269,
+    "icu transfer": 9,
+    "Insulin": 808,
+    "intervention": 2695,
+    "intubation": 493,
+    "Intratracheal Intubation": 3,
+    "Intravenous Infusion": 576,
+    "iv fluids": 75,
+    "laboratory techniques": 29,
+    "laboratory testing": 296,
+    "levophed": 11,
+    "Lidocaine": 212,
+    "manage": 4416,
+    "management": 4008,
+    "medication": 4698,
+    "midazolam": 204,
+    "monitor": 4521,
+    "monitoring": 3593,
+    "Morphine": 289,
+    "Nebulization": 41,
+    "nitroglycerin": 125,
+    "NTG": 81,
+    "Norepinephrine": 392,
+    "normal saline": 252,
+    "Ondansetron": 43,
+    "Oxygen": 1779,
+    "Oxygen Inhalation Therapy": 2,
+    "oxygen therapy": 178,
+    "Patient Management": 281,
+    "Patient Monitoring": 107,
+    "POCUS": 10,
+    "point of care ultrasound": 2,
+    "procedural sedation": 26,
+    "procedure": 3073,
+    "radiologic imaging": 5,
+    "Radiography": 218,
+    "resuscitation": 539,
+    "Sedation": 602,
+    "splinting": 26,
+    "Splints": 29,
+    "supportive care": 564,
+    "surgical procedures": 482,
+    "Surgical Procedures Operative": 0,
+    "surgery": 3531,
+    "Suture": 179,
+    "Suturing": 53,
+    "Therapeutic Intervention": 181,
+    "Therapeutics": 182,
+    "Therapy": 6117,
+    "tourniquet": 56,
+    "transfusion": 826,
+    "treat": 8270,
+    "treatment": 7719,
+    "Ultrasonography Point of Care": 0,
+    "ultrasound": 1273,
+    "Vasoconstrictor Agents": 2,
+    "vasopressors": 188,
+    "ventilation support": 14,
+    "Ventilators": 86,
+    "Vital Signs": 459,
+    "vital signs monitoring": 1,
+    "wound care": 73,
+    "Wound Dressing": 30,
+    "Wound Management": 37,
+    "X-Ray": 1293
+  },
+  "cooccurrence_analysis": [
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 3488,
+      "percentage": 37.23710899967973
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 2698,
+      "percentage": 28.803245436105477
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "dose",
+      "cooccurrence_count": 2430,
+      "percentage": 25.94213729048788
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "medication",
+      "cooccurrence_count": 1979,
+      "percentage": 21.127362015586634
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1760,
+      "percentage": 18.789366926443897
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "management",
+      "cooccurrence_count": 1753,
+      "percentage": 18.714636489804633
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "treat",
+      "cooccurrence_count": 1744,
+      "percentage": 18.618554499839863
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "monitoring",
+      "cooccurrence_count": 1674,
+      "percentage": 17.87125013344721
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1558,
+      "percentage": 16.63286004056795
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "surgery",
+      "cooccurrence_count": 1505,
+      "percentage": 16.06704387744208
+    },
+    {
+      "emergency_keyword": "Tachycardia",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1441,
+      "percentage": 15.383794171025942
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "dose",
+      "cooccurrence_count": 1423,
+      "percentage": 15.191630191096403
+    },
+    {
+      "emergency_keyword": "Myocardial Infarction",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1369,
+      "percentage": 14.615138251307783
+    },
+    {
+      "emergency_keyword": "Shock",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1340,
+      "percentage": 14.305540728087967
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "fluid",
+      "cooccurrence_count": 1330,
+      "percentage": 14.198782961460447
+    },
+    {
+      "emergency_keyword": "Hemorrhage",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1328,
+      "percentage": 14.177431408134941
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "monitoring",
+      "cooccurrence_count": 1325,
+      "percentage": 14.145404078146683
+    },
+    {
+      "emergency_keyword": "Tachycardia",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1277,
+      "percentage": 13.632966798334579
+    },
+    {
+      "emergency_keyword": "Dyspnea",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1228,
+      "percentage": 13.10985374185972
+    },
+    {
+      "emergency_keyword": "Myocardial Infarction",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1215,
+      "percentage": 12.97106864524394
+    }
+  ],
+  "path_b_validation": {
+    "avg_emergency_density": 0.3098621434407273,
+    "avg_treatment_density": 0.6108515041451529,
+    "high_density_records": 1298,
+    "precision_estimate": 0.9995729689334899
+  },
+  "condition_mapping_candidates": {}
+}
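Each percentage in cooccurrence_analysis is the co-occurrence count over total_records (e.g. 3488 / 9367 ≈ 37.237 % for Fever × treatment). A sketch of that computation — the split-on-"|" membership test is an assumption about how the matched columns are compared, not the PR's actual analysis script:

```python
import pandas as pd

df = pd.read_json("dataset/emergency/emergency_treatment_subset.jsonl", lines=True)  # assumed path
total = len(df)

def cooccurrence(emergency_kw: str, treatment_kw: str) -> dict:
    """Count records whose two matched columns both contain the given keywords."""
    has_em = df["matched"].fillna("").str.lower().str.split("|").apply(
        lambda terms: emergency_kw.lower() in terms)
    has_tx = df["treatment_matched"].fillna("").str.lower().str.split("|").apply(
        lambda terms: treatment_kw.lower() in terms)
    count = int((has_em & has_tx).sum())
    return {"emergency_keyword": emergency_kw, "treatment_keyword": treatment_kw,
            "cooccurrence_count": count, "percentage": 100.0 * count / total}

print(cooccurrence("Fever", "treatment"))  # 3488 / 9367 = 37.237...% in the report above
```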
dataset/check_source.py ADDED
@@ -0,0 +1,18 @@
+import pandas as pd
+
+# Read the JSONL file that was just downloaded and filtered
+df = pd.read_json("dataset/guidelines_source_filtered.jsonl", lines=True)
+
+# Show how many records come from each source
+print("📊 Record counts per source:")
+print(df["source"].value_counts())
+
+# Verify that only the 9 expected sources are present
+expected_sources = {"cco", "cdc", "cma", "icrc", "nice", "pubmed", "spor", "who", "wikidoc"}
+actual_sources = set(df["source"].unique())
+
+# Show the verification result
+if actual_sources == expected_sources:
+    print("✅ Sources match expectations exactly; no unexpected sources.")
+else:
+    print(f"❌ Unexpected sources found: {actual_sources - expected_sources}")
dataset/filter_guidelines.py ADDED
@@ -0,0 +1,31 @@
+# filter_guidelines.py
+
+from datasets import load_dataset
+import pandas as pd
+import os
+
+# ✅ Trusted source abbreviations (the "source" field in the Hugging Face dataset)
+approved_sources = ["cco", "cdc", "cma", "icrc", "nice", "pubmed", "spor", "who", "wikidoc"]
+
+# Step 1: Load the dataset from Hugging Face
+print("⏳ Loading data...")
+ds = load_dataset("epfl-llm/guidelines", split="train")
+
+# Step 2: Filter by the "source" field
+print("🔍 Filtering trusted sources...")
+ds_filtered = ds.filter(lambda ex: ex["source"] in approved_sources)
+print(f"✅ Filtering complete: {len(ds_filtered)} records in total.")
+
+# Step 3: Convert to a pandas DataFrame
+print("📄 Converting to DataFrame...")
+df = ds_filtered.to_pandas()
+
+# Step 4: Create the dataset folder (if it does not exist)
+os.makedirs("dataset", exist_ok=True)
+
+# Step 5: Save as JSONL and CSV into the dataset/ folder
+print("💾 Saving to the dataset/ folder...")
+df.to_json("dataset/guidelines_source_filtered.jsonl", orient="records", lines=True)
+df.to_csv("dataset/guidelines_source_filtered.csv", index=False)
+
+print("🎉 Done! Data from trusted sources has been saved.")
dataset/keywords/emergency_keywords.txt ADDED
@@ -0,0 +1,47 @@
+Acute abdomen
+Acute bleeding
+Acute Coronary Syndrome
+Acute Kidney Injury
+Acute pancreatitis
+Acute respiratory distress syndrome
+Acute stroke
+Anaphylaxis
+Anaphylactic Shock
+Arrhythmia
+Atrial fibrillation
+Atrial flutter
+Bradycardia
+Cardiac arrest
+Cardiogenic Shock
+Chest pain
+Dyspnea
+Fever
+Gastrointestinal Hemorrhage
+GI bleeding
+Hemorrhage
+Hemorrhagic stroke
+Hyperthermia
+Hypovolemic Shock
+Hypotension
+Hypothermia
+Internal bleeding
+Intracranial Hemorrhages
+Ischemic stroke
+Loss of consciousness
+Myocardial Infarction
+MI
+Pulmonary Edema
+Pulmonary Embolism
+Respiratory distress
+Respiratory failure
+Sepsis
+Severe Sepsis
+Septic Shock
+Shock
+Status Epilepticus
+Syncope
+Tachycardia
+Tachypnea
+Traumatic Brain Injury
+Ventricular Tachycardia
+Ventricular fibrillation
dataset/keywords/special_terms_emergency.json ADDED
@@ -0,0 +1,26 @@
+{
+  "cardiac": {
+    "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
+    "acs": ["acs", "ACS", "acute coronary syndrome"]
+  },
+  "respiratory": {
+    "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
+    "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
+  },
+  "neurological": {
+    "loc": ["loc", "LOC", "loss of consciousness"],
+    "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
+  },
+  "shock": {
+    "shock": ["shock", "circulatory failure"],
+    "septic_shock": ["septic shock", "sepsis induced shock"]
+  },
+  "bleeding": {
+    "gi_bleed": ["gi bleed", "gi bleeding", "gastrointestinal hemorrhage", "GI hemorrhage"],
+    "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
+  },
+  "vital_signs": {
+    "hypotension": ["hypotension", "low bp", "low blood pressure"],
+    "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
+  }
+}
dataset/keywords/special_terms_treatment.json ADDED
@@ -0,0 +1,25 @@
+{
+  "imaging": {
+    "x-ray": ["x-ray", "x ray", "xray", "XR"],
+    "ct": ["ct", "ct-scan", "cat scan", "computed tomography"],
+    "us": ["us", "u/s", "ultrasound", "sonography"]
+  },
+  "medications": {
+    "iv": ["iv", "i.v.", "intravenous"],
+    "im": ["im", "i.m.", "intramuscular"],
+    "po": ["po", "p.o.", "per os", "by mouth"]
+  },
+  "procedures": {
+    "cpr": ["cpr", "CPR", "cardiopulmonary resuscitation"],
+    "intubation": ["intubation", "ETT", "endotracheal tube"],
+    "cardioversion": ["cardioversion", "electrical cardioversion"]
+  },
+  "monitoring": {
+    "ecg": ["ecg", "ekg", "electrocardiogram"],
+    "monitoring": ["monitoring", "continuous observation"]
+  },
+  "ventilation": {
+    "bipap": ["bipap", "BiPAP", "bi-level positive airway pressure"],
+    "cpap": ["cpap", "CPAP", "continuous positive airway pressure"]
+  }
+}
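Note that short variants such as "us" and "ct" are matched as standalone words, so they collide with ordinary English: the comparison reports above list "us" among matched treatment keywords for cancer-care guidelines, which is plausibly the pronoun rather than ultrasound. A quick demonstration:

```python
import re

# Word-boundary matching cannot distinguish the abbreviation "us" from the pronoun.
pattern = re.compile(r"\b(?:us|ct)\b", re.IGNORECASE)
print(pattern.findall("it focuses on us, the patients"))  # ['us'] -- likely false positive
print(pattern.findall("a CT of the chest"))               # ['CT'] -- intended match
```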
dataset/keywords/treatment_keywords.txt ADDED
@@ -0,0 +1,105 @@
+ACLS
+administer
+Adrenaline
+Advanced Cardiac Life Support
+Airway Management
+alpha blocker
+Amiodarone
+analgesia
+Anesthesia Procedural
+Anti-Bacterial Agents
+antibiotic
+arterial line placement
+beta blocker
+Bi-level Positive Airway Pressure
+bipap
+Blood Transfusion
+Bosmin
+Cardiopulmonary Resuscitation
+Cardioversion
+Catheterization Arterial
+Catheterization Central Venous
+central line placement
+compression dressing
+Computed Tomography
+cpap
+cpr
+crystalloids
+ct scan
+Defibrillation
+Dopamine
+Dosage Forms
+dose
+Drug Administration Routes
+Drug Therapy
+Epinephrine
+fluid
+fluid resuscitation
+hemodynamic monitoring
+Hemodynamics
+Hemostasis
+Ibuprofen
+icu transfer
+Insulin
+intervention
+intubation
+Intratracheal Intubation
+Intravenous Infusion
+iv fluids
+laboratory techniques
+laboratory testing
+levophed
+Lidocaine
+manage
+management
+medication
+midazolam
+monitor
+monitoring
+Morphine
+Nebulization
+nitroglycerin
+NTG
+Norepinephrine
+normal saline
+Ondansetron
+Oxygen
+Oxygen Inhalation Therapy
+oxygen therapy
+Patient Management
+Patient Monitoring
+POCUS
+point of care ultrasound
+procedural sedation
+procedure
+radiologic imaging
+Radiography
+resuscitation
+Sedation
+splinting
+Splints
+supportive care
+surgical procedures
+Surgical Procedures Operative
+surgery
+Suture
+Suturing
+Therapeutic Intervention
+Therapeutics
+Therapy
+tourniquet
+transfusion
+treat
+treatment
+Ultrasonography Point of Care
+ultrasound
+Vasoconstrictor Agents
+vasopressors
+ventilation support
+Ventilators
+Vital Signs
+vital signs monitoring
+wound care
+Wound Dressing
+Wound Management
+X-Ray
dataset/scripts/01_filter_emergency.py ADDED
@@ -0,0 +1,58 @@
+# scripts/01_filter_emergency.py
+
+import os
+import re
+import pandas as pd
+
+# Function: Load keywords and print progress
+def load_keywords(path):
+    print(f"📥 Loading keywords from: {path}")
+    with open(path, "r", encoding="utf-8") as f:
+        kws = [line.strip() for line in f if line.strip()]
+    print(f"   Loaded {len(kws)} keywords")
+    return kws
+
+# Step 1: Read source data
+print("1️⃣ Reading source data...")
+source_path = "../dataset/guidelines_source_filtered.jsonl"
+df = pd.read_json(source_path, lines=True)
+print(f"   Loaded {len(df)} records")
+
+# Step 2: Load emergency keywords and match
+print("2️⃣ Loading emergency keywords and matching...")
+keywords = load_keywords("../keywords/emergency_keywords.txt")
+pattern = r"\b(?:" + "|".join(keywords) + r")\b"  # Using non-capturing groups (?:...)
+
+# Match keywords and add metadata columns
+df["matched"] = (
+    df["clean_text"]
+    .fillna("")  # Convert NaN to empty string
+    .str.findall(pattern, flags=re.IGNORECASE)
+    .apply(lambda lst: "|".join(lst) if lst else "")
+)
+df["has_emergency"] = df["matched"].str.len() > 0
+
+# Add metadata columns for future use
+df["type"] = "emergency"  # Document type identifier
+df["condition"] = ""  # Reserved for future condition mapping
+
+# Calculate average matches
+cnt_em = df["has_emergency"].sum()
+avg_matches = (
+    df[df["has_emergency"]]["matched"]
+    .str.count(r"\|")  # Escape the pipe
+    .add(1)
+    .mean()
+)
+
+print(f"   Matched {cnt_em} emergency-related records")
+print(f"   Average keywords per record: {avg_matches:.2f}")
+
+# Step 3: Save emergency subset
+print("3️⃣ Saving emergency subset...")
+out_dir = "../dataset/emergency"
+os.makedirs(out_dir, exist_ok=True)
+subset = df[df["has_emergency"]]
+subset.to_json(f"{out_dir}/emergency_subset.jsonl", orient="records", lines=True)
+subset.to_csv(f"{out_dir}/emergency_subset.csv", index=False)
+print(f"✅ Complete! Generated emergency subset with {len(subset)} records, saved in `{out_dir}`")
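One difference worth flagging: this script interpolates the raw keywords into the regex without escaping, while 01_filter_emergency_opt.py below wraps them in re.escape. That matters once the keyword set includes variants with regex metacharacters, such as the "." in "m.i." or "i.v." from the special-terms JSON files. A quick illustration of the failure mode, with hypothetical inputs:

```python
import re

keywords = ["m.i.", "x-ray"]  # variants like these appear in the special-terms JSON files

# Unescaped: "." matches any character, so unrelated words can match "m.i."
raw = re.compile(r"\b(?:" + "|".join(keywords) + r")\b", re.IGNORECASE)
print(raw.findall("maid with x-ray"))   # ['maid', 'x-ray'] -- 'maid' is a false positive

# Escaped, as in 01_filter_emergency_opt.py: "." is treated literally
safe = re.compile(r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b", re.IGNORECASE)
print(safe.findall("maid with x-ray"))  # ['x-ray']
```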
dataset/scripts/01_filter_emergency_opt.py
ADDED
@@ -0,0 +1,112 @@
import os
import re
import json
import pandas as pd
from pathlib import Path

class MedicalTermProcessor:
    def __init__(self):
        # Load emergency special terms from JSON
        keywords_dir = Path("../keywords")
        with open(keywords_dir / "special_terms_emergency.json", "r") as f:
            self.emergency_terms_by_category = json.load(f)

        # Flatten the nested structure for easy lookup
        self.emergency_special_terms = {}
        for category in self.emergency_terms_by_category.values():
            self.emergency_special_terms.update(category)

    def get_all_variants(self):
        """Get all term variants including special terms"""
        variants = []
        for term_list in self.emergency_special_terms.values():
            variants.extend(term_list)
        return variants

    def standardize_term(self, term: str) -> str:
        """Convert a term to its standard form if it's a variant"""
        term_lower = term.lower()
        for standard_term, variants in self.emergency_special_terms.items():
            if term_lower in [v.lower() for v in variants]:
                return standard_term
        return term

    def process_matches(self, matches: list) -> str:
        """Process matches to standardize terms and remove duplicates"""
        if not matches:
            return ""

        # Standardize terms
        standardized = [self.standardize_term(match) for match in matches]

        # Remove duplicates while preserving order
        seen = set()
        unique_matches = []
        for term in standardized:
            if term.lower() not in seen:
                unique_matches.append(term)
                seen.add(term.lower())

        return "|".join(unique_matches)

# Function: Load keywords and print progress
def load_keywords(path, processor):
    print(f"📥 Loading keywords from: {path}")
    # Load basic keywords
    with open(path, "r", encoding="utf-8") as f:
        basic_kws = [line.strip() for line in f if line.strip()]

    # Add special term variants
    special_kws = processor.get_all_variants()
    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates

    print(f" Loaded {len(all_kws)} keywords (including variants)")
    return all_kws

# Step 1: Read source data
print("1️⃣ Reading source data...")
source_path = "../dataset/guidelines_source_filtered.jsonl"
df = pd.read_json(source_path, lines=True)
print(f" Loaded {len(df)} records")

# Step 2: Load emergency keywords and match
print("2️⃣ Loading emergency keywords and matching...")
processor = MedicalTermProcessor()
keywords = load_keywords("../keywords/emergency_keywords.txt", processor)
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"

# Match keywords and add metadata columns
df["matched"] = (
    df["clean_text"]
    .fillna("")  # Convert NaN to empty string
    .str.findall(pattern, flags=re.IGNORECASE)
    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
)
df["has_emergency"] = df["matched"].str.len() > 0

# Add metadata columns for future use
df["type"] = "emergency"  # Document type identifier
df["condition"] = ""  # Reserved for future condition mapping

# Calculate average matches
cnt_em = df["has_emergency"].sum()
avg_matches = (
    df[df["has_emergency"]]["matched"]
    .str.count(r"\|")  # Escape the pipe
    .add(1)
    .mean()
)

print(f" Matched {cnt_em} emergency-related records")
print(f" Average keywords per record: {avg_matches:.2f}")

# Step 3: Save emergency subset
print("3️⃣ Saving emergency subset...")
out_dir = "../dataset/emergency"
os.makedirs(out_dir, exist_ok=True)
subset = df[df["has_emergency"]]

# Save with _opt suffix to distinguish from original files
subset.to_json(f"{out_dir}/emergency_subset_opt.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_subset_opt.csv", index=False)
print(f"✅ Complete! Generated emergency subset with {len(subset)} records, saved in `{out_dir}` with _opt suffix")
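A standalone sketch of the standardize-and-deduplicate behaviour of process_matches, with a made-up variants table (the committed special_terms_emergency.json is not shown in this diff):

special_terms = {"myocardial infarction": ["MI", "myocardial infarction", "heart attack"]}  # hypothetical

def standardize(term):
    for standard, variants in special_terms.items():
        if term.lower() in [v.lower() for v in variants]:
            return standard
    return term

matches = ["MI", "heart attack", "mi"]  # hypothetical regex hits
seen, unique = set(), []
for t in (standardize(m) for m in matches):
    if t.lower() not in seen:
        unique.append(t)
        seen.add(t.lower())
print("|".join(unique))  # -> "myocardial infarction": variants standardized, duplicates dropped, order preserved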
dataset/scripts/02_filter_treatment.py
ADDED
@@ -0,0 +1,103 @@
# scripts/02_filter_treatment.py

import os
import re
import pandas as pd

def preprocess_keywords(keywords_file):
    """Load and preprocess treatment keywords"""
    print(f"📥 Loading keywords from: {keywords_file}")

    # Special medical terms with common variants
    special_terms = {
        'x-ray': ['x-ray', 'x ray', 'xray'],
        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
        'point-of-care': ['point-of-care', 'point of care']
    }

    # Read and preprocess keywords
    with open(keywords_file, "r", encoding="utf-8") as f:
        keywords = [line.strip().lower() for line in f if line.strip()]

    # Process keywords and handle special terms
    processed_keywords = []
    for kw in keywords:
        if kw in special_terms:
            processed_keywords.extend(special_terms[kw])
        else:
            processed_keywords.append(kw)

    print(f" Loaded {len(keywords)} base keywords")
    print(f" Processed into {len(processed_keywords)} keyword variants")
    return processed_keywords

def create_regex_pattern(keywords):
    """Create compiled regex pattern with word boundaries"""
    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
    return re.compile(pattern, re.IGNORECASE)

# Step 1: Read source data
print("1️⃣ Reading emergency subset...")
emergency_path = "../dataset/emergency/emergency_subset.jsonl"
df = pd.read_json(emergency_path, lines=True)
print(f" Loaded {len(df)} emergency records")
print(f" Contains emergency keywords in 'matched' column")

# Step 2: Load treatment keywords and match
print("2️⃣ Loading treatment keywords and matching...")
treatment_keywords = preprocess_keywords("../keywords/treatment_keywords.txt")
pattern = create_regex_pattern(treatment_keywords)

# Step 3: Process text and match keywords
print("3️⃣ Processing text and matching keywords...")
# Create lowercase version of text for matching
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

# Match treatment keywords and add metadata columns
# Note: Preserving original 'matched' column from emergency subset
df["treatment_matched"] = (
    df["clean_text_lower"]
    .apply(lambda text: "|".join(pattern.findall(text)) or "")
)
df["has_treatment"] = df["treatment_matched"].str.len() > 0

# Add metadata columns for future use
df["type"] = "treatment"  # Document type identifier
df["condition"] = ""  # Reserved for future condition mapping

# Verify columns
print(" Verifying columns...")
print(f" - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
print(f" - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")

# Calculate statistics
cnt_treat = df["has_treatment"].sum()
avg_matches = (
    df[df["has_treatment"]]["treatment_matched"]
    .str.count(r"\|")
    .add(1)
    .mean()
)

print(f" Found {cnt_treat} treatment-related records")
print(f" Average treatment keywords per record: {avg_matches:.2f}")

# Step 4: Save treatment subset
print("4️⃣ Saving treatment subset...")
out_dir = "../dataset/emergency_treatment"
os.makedirs(out_dir, exist_ok=True)

# Select records with treatment keywords
subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning

# Verify final subset columns
print(" Final subset columns:")
print(f" - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
print(f" - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")

subset.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)

print(f"✅ Generated treatment subset with {len(subset)} records")
print(f" Saved in: {out_dir}")
print(f" Contains both emergency and treatment keywords")
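For illustration, the variant expansion in preprocess_keywords behaves as in this minimal sketch, assuming a hypothetical two-entry keyword file:

special_terms = {'x-ray': ['x-ray', 'x ray', 'xray']}
keywords = ['x-ray', 'defibrillation']  # hypothetical file contents

processed = []
for kw in keywords:
    processed.extend(special_terms.get(kw, [kw]))
print(processed)  # ['x-ray', 'x ray', 'xray', 'defibrillation']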
dataset/scripts/02_filter_treatment_opt.py
ADDED
@@ -0,0 +1,131 @@
import os
import re
import json
import pandas as pd
from pathlib import Path

class MedicalTermProcessor:
    def __init__(self):
        # Load treatment special terms from JSON
        keywords_dir = Path("../keywords")
        with open(keywords_dir / "special_terms_treatment.json", "r") as f:
            self.treatment_terms_by_category = json.load(f)

        # Flatten the nested structure for easy lookup
        self.treatment_special_terms = {}
        for category in self.treatment_terms_by_category.values():
            self.treatment_special_terms.update(category)

    def get_all_variants(self):
        """Get all term variants including special terms"""
        variants = []
        for term_list in self.treatment_special_terms.values():
            variants.extend(term_list)
        return variants

    def standardize_term(self, term: str) -> str:
        """Convert a term to its standard form if it's a variant"""
        term_lower = term.lower()
        for standard_term, variants in self.treatment_special_terms.items():
            if term_lower in [v.lower() for v in variants]:
                return standard_term
        return term

    def process_matches(self, matches: list) -> str:
        """Process matches to standardize terms and remove duplicates"""
        if not matches:
            return ""

        # Standardize terms
        standardized = [self.standardize_term(match) for match in matches]

        # Remove duplicates while preserving order
        seen = set()
        unique_matches = []
        for term in standardized:
            if term.lower() not in seen:
                unique_matches.append(term)
                seen.add(term.lower())

        return "|".join(unique_matches)

def load_keywords(path, processor):
    """Load and preprocess treatment keywords"""
    print(f"📥 Loading keywords from: {path}")

    # Load basic keywords
    with open(path, "r", encoding="utf-8") as f:
        basic_kws = [line.strip() for line in f if line.strip()]

    # Add special term variants
    special_kws = processor.get_all_variants()
    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates

    print(f" Loaded {len(all_kws)} keywords (including variants)")
    return all_kws

# Step 1: Read optimized emergency subset
print("1️⃣ Reading optimized emergency subset...")
emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
df = pd.read_json(emergency_path, lines=True)
print(f" Loaded {len(df)} emergency records")
print(f" Contains emergency keywords in 'matched' column")

# Step 2: Load treatment keywords and match
print("2️⃣ Loading treatment keywords and matching...")
processor = MedicalTermProcessor()
keywords = load_keywords("../keywords/treatment_keywords.txt", processor)
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"

# Step 3: Process text and match keywords
print("3️⃣ Processing text and matching keywords...")
# Match treatment keywords and add metadata columns
df["treatment_matched"] = (
    df["clean_text"]
    .fillna("")  # Convert NaN to empty string
    .str.findall(pattern, flags=re.IGNORECASE)
    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
)
df["has_treatment"] = df["treatment_matched"].str.len() > 0

# Add metadata columns for future use
df["type"] = "treatment"  # Document type identifier
df["condition"] = ""  # Reserved for future condition mapping

# Verify columns
print(" Verifying columns...")
print(f" - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
print(f" - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")

# Calculate statistics
cnt_treat = df["has_treatment"].sum()
avg_matches = (
    df[df["has_treatment"]]["treatment_matched"]
    .str.count(r"\|")
    .add(1)
    .mean()
)

print(f" Found {cnt_treat} treatment-related records")
print(f" Average treatment keywords per record: {avg_matches:.2f}")

# Step 4: Save treatment subset
print("4️⃣ Saving treatment subset...")
out_dir = "../dataset/emergency_treatment"
os.makedirs(out_dir, exist_ok=True)

# Select records with treatment keywords
subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning

# Verify final subset columns
print(" Final subset columns:")
print(f" - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
print(f" - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")

# Save with _opt suffix
subset.to_json(f"{out_dir}/emergency_treatment_subset_opt.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_treatment_subset_opt.csv", index=False)

print(f"✅ Generated optimized treatment subset with {len(subset)} records")
print(f" Saved in: {out_dir}")
print(f" Contains both emergency and treatment keywords")
dataset/scripts/check_subset_integrity.py
ADDED
@@ -0,0 +1,178 @@
#!/usr/bin/env python3
# /scripts/check_subset_integrity.py

import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm

def check_subset_sample(file_path, sample_size=100):
    """
    Check the first N rows of the subset file
    """
    print(f"\n{'='*60}")
    print(f"📊 Sampling Analysis (first {sample_size} rows)")
    print(f"{'='*60}")

    # Read sample
    print(f"\n1️⃣ Reading sample from: {file_path}")
    sample_df = pd.read_csv(file_path, nrows=sample_size)

    # Basic information
    print("\n2️⃣ Basic Information:")
    print(f" Columns present: {', '.join(sample_df.columns.tolist())}")

    # Check matched columns
    print("\n3️⃣ Matched Columns Status:")
    matched_stats = {
        'matched': {
            'non_null': int(sample_df['matched'].notna().sum()),
            'non_empty': int((sample_df['matched'].str.len() > 0).sum()),
            'unique_values': sample_df['matched'].nunique()
        },
        'treatment_matched': {
            'non_null': int(sample_df['treatment_matched'].notna().sum()),
            'non_empty': int((sample_df['treatment_matched'].str.len() > 0).sum()),
            'unique_values': sample_df['treatment_matched'].nunique()
        }
    }

    for col, stats in matched_stats.items():
        print(f"\n {col}:")
        print(f" - Non-null count: {stats['non_null']}/{sample_size}")
        print(f" - Non-empty count: {stats['non_empty']}/{sample_size}")
        print(f" - Unique values: {stats['unique_values']}")

    # Sample rows with both matches
    print("\n4️⃣ Sample Rows with Both Matches:")
    both_matched = sample_df[
        (sample_df['matched'].notna() & sample_df['matched'].str.len() > 0) &
        (sample_df['treatment_matched'].notna() & sample_df['treatment_matched'].str.len() > 0)
    ].head(3)

    for idx, row in both_matched.iterrows():
        print(f"\n Row {idx}:")
        print(f" - Emergency keywords: {row['matched']}")
        print(f" - Treatment keywords: {row['treatment_matched']}")

    return matched_stats

def analyze_large_file(file_path, chunk_size=1000):
    """
    Analyze the entire file in chunks
    """
    print(f"\n{'='*60}")
    print(f"📈 Full File Analysis (chunk size: {chunk_size})")
    print(f"{'='*60}")

    stats = {
        'total_rows': 0,
        'matched_stats': {
            'non_null': 0,
            'non_empty': 0
        },
        'treatment_matched_stats': {
            'non_null': 0,
            'non_empty': 0
        },
        'both_matched': 0
    }

    print("\n1️⃣ Processing file in chunks...")
    chunks = pd.read_csv(file_path, chunksize=chunk_size)

    for chunk in tqdm(chunks, desc="Analyzing chunks"):
        # Update total rows
        stats['total_rows'] += len(chunk)

        # Update matched stats
        stats['matched_stats']['non_null'] += chunk['matched'].notna().sum()
        stats['matched_stats']['non_empty'] += (chunk['matched'].str.len() > 0).sum()

        # Update treatment_matched stats
        stats['treatment_matched_stats']['non_null'] += chunk['treatment_matched'].notna().sum()
        stats['treatment_matched_stats']['non_empty'] += (chunk['treatment_matched'].str.len() > 0).sum()

        # Update both matched count
        stats['both_matched'] += (
            (chunk['matched'].notna() & chunk['matched'].str.len() > 0) &
            (chunk['treatment_matched'].notna() & chunk['treatment_matched'].str.len() > 0)
        ).sum()

    return stats

def generate_report(sample_stats, full_stats, output_dir):
    """
    Generate and save analysis report
    """
    print(f"\n{'='*60}")
    print(f"📝 Generating Report")
    print(f"{'='*60}")

    report = {
        'sample_analysis': sample_stats,
        'full_file_analysis': {
            'total_records': int(full_stats['total_rows']),
            'matched_column': {
                'non_null_count': int(full_stats['matched_stats']['non_null']),
                'non_empty_count': int(full_stats['matched_stats']['non_empty']),
                'null_percentage': float(
                    (full_stats['total_rows'] - full_stats['matched_stats']['non_null'])
                    / full_stats['total_rows'] * 100
                )
            },
            'treatment_matched_column': {
                'non_null_count': int(full_stats['treatment_matched_stats']['non_null']),
                'non_empty_count': int(full_stats['treatment_matched_stats']['non_empty']),
                'null_percentage': float(
                    (full_stats['total_rows'] - full_stats['treatment_matched_stats']['non_null'])
                    / full_stats['total_rows'] * 100
                )
            },
            'both_matched_count': int(full_stats['both_matched']),
            'both_matched_percentage': float(
                full_stats['both_matched'] / full_stats['total_rows'] * 100
            )
        }
    }

    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save report
    report_file = output_dir / 'integrity_check_report.json'
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {report_file}")

    # Print summary
    print("\n📊 Summary:")
    print(f"Total records: {report['full_file_analysis']['total_records']}")
    print(f"Records with both matches: {report['full_file_analysis']['both_matched_count']} "
          f"({report['full_file_analysis']['both_matched_percentage']:.2f}%)")

    return report

def main():
    """
    Main execution function
    """
    # Configuration
    input_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
    output_dir = "../analysis/integrity_check"

    print(f"\n🔍 Starting Subset Integrity Check")
    print(f"Input file: {input_file}")
    print(f"Output directory: {output_dir}")

    # Run analysis
    sample_stats = check_subset_sample(input_file)
    full_stats = analyze_large_file(input_file)
    report = generate_report(sample_stats, full_stats, output_dir)

    print("\n✅ Integrity check complete!")

if __name__ == "__main__":
    main()
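One caveat when reading the boolean masks above: in Python, `&` binds more tightly than `>`, so `s.notna() & s.str.len() > 0` parses as `(s.notna() & s.str.len()) > 0`. A small sketch of the explicitly parenthesized form such checks presumably intend:

import pandas as pd

s = pd.Series(["a|b", "", None])
mask = s.notna() & (s.str.len() > 0)  # explicit parentheses avoid the precedence trap
print(mask.tolist())  # [True, False, False]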
dataset/scripts/commit_message_20250726_special_terms.txt
ADDED
@@ -0,0 +1,39 @@
refactor: migrate special terms to JSON configuration

BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files

1. Create New Configuration Files:
   - Add special_terms_emergency.json
     - Organize emergency terms by categories (cardiac, respiratory, etc.)
     - Include all existing mappings with standardized structure
   - Add special_terms_treatment.json
     - Organize treatment terms by categories (imaging, medications, etc.)
     - Maintain all existing term variants

2. Update Processing Scripts:
   - Modify 01_filter_emergency_opt.py:
     - Load terms from JSON configuration
     - Add term standardization
     - Implement deduplication
     - Preserve category information
   - Modify 02_filter_treatment_opt.py:
     - Similar updates for treatment terms
     - Maintain consistent processing logic

3. New Features:
   - Term standardization: Convert variants to standard form
   - Deduplication: Remove repeated terms while preserving order
   - Category-aware: Support for term categorization
   - Improved maintainability: Configuration separated from code

4. Technical Details:
   - Use pathlib for file path handling
   - JSON structure supports hierarchical organization
   - Maintain backward compatibility
   - Add type hints for better code clarity

Testing:
- Verify JSON format
- Confirm all mappings migrated correctly
- Check term standardization
- Validate deduplication logic
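For reference, the loaders in 01_filter_emergency_opt.py and 02_filter_treatment_opt.py expect a two-level structure of the form {category: {standard term: [variant list]}}. The sketch below is illustrative only; the category names come from this commit message, but the terms are hypothetical and not the contents of the committed JSON files:

{
  "cardiac": {
    "myocardial infarction": ["myocardial infarction", "MI", "heart attack"]
  },
  "respiratory": {
    "respiratory distress": ["respiratory distress", "shortness of breath"]
  }
}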
dataset/scripts/compare_subsets_opt.py
ADDED
@@ -0,0 +1,124 @@
# /scripts/compare_subsets_opt.py
import pandas as pd
from pathlib import Path
from datetime import datetime

def load_and_compare_subsets(format_type='csv'):
    """
    Load and compare the first 10 records from both optimized subsets

    Args:
        format_type (str): 'csv' or 'jsonl'
    """
    # Prepare output file
    output_dir = Path("../analysis")
    output_dir.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_dir / f"subset_comparison_first10_records_{timestamp}.md"

    # Initialize markdown content
    md_content = []
    md_content.append("# Optimized Subsets Comparison Report\n")
    md_content.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    md_content.append(f"File format: {format_type.upper()}\n")

    # Set file paths based on format
    if format_type == 'csv':
        emergency_path = "../dataset/emergency/emergency_subset_opt.csv"
        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
        # Load CSV files
        emergency_df = pd.read_csv(emergency_path)
        treatment_df = pd.read_csv(treatment_path)
    else:  # jsonl
        emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.jsonl"
        # Load JSONL files
        emergency_df = pd.read_json(emergency_path, lines=True)
        treatment_df = pd.read_json(treatment_path, lines=True)

    # Print and save basic statistics
    print("\n📊 Basic Statistics:")
    print("-" * 40)
    md_content.append("\n## Basic Statistics\n")

    stats = [
        f"- Emergency subset total records: {len(emergency_df)}",
        f"- Emergency+Treatment subset total records: {len(treatment_df)}",
        f"- Avg Emergency Text Length: {emergency_df['clean_text'].str.len().mean():.2f}",
        f"- Avg Treatment Text Length: {treatment_df['clean_text'].str.len().mean():.2f}"
    ]

    # Calculate average keywords using pattern
    pattern = r'\|'
    emergency_avg = emergency_df['matched'].str.count(pattern).add(1).mean()
    treatment_avg = treatment_df['matched'].str.count(pattern).add(1).mean()

    stats.extend([
        f"- Avg Emergency Keywords: {emergency_avg:.2f}",
        f"- Avg Treatment Keywords: {treatment_avg:.2f}"
    ])

    # Print to console and add to markdown
    for stat in stats:
        print(stat.replace("- ", ""))
    md_content.extend(stats)

    # Compare first 10 records from Emergency subset
    print("\n🔍 First 10 records from Emergency Subset:")
    print("-" * 80)
    md_content.append("\n## Emergency Subset (First 10 Records)\n")

    for idx, row in emergency_df.head(10).iterrows():
        print(f"\nRecord #{idx+1}")
        print(f"Text preview: {row['clean_text'][:100]}...")
        print(f"Matched keywords: {row['matched']}")
        print(f"Text length: {len(row['clean_text'])}")
        print("-" * 40)

        md_content.extend([
            f"\n### Record {idx+1}",
            "```",
            f"Text preview: {row['clean_text'][:100]}...",
            f"Matched keywords: {row['matched']}",
            f"Text length: {len(row['clean_text'])}",
            "```\n"
        ])

    # Compare first 10 records from Emergency+Treatment subset
    print("\n🔍 First 10 records from Emergency+Treatment Subset:")
    print("-" * 80)
    md_content.append("\n## Emergency+Treatment Subset (First 10 Records)\n")

    for idx, row in treatment_df.head(10).iterrows():
        print(f"\nRecord #{idx+1}")
        print(f"Text preview: {row['clean_text'][:100]}...")
        print(f"Emergency keywords: {row['matched']}")
        print(f"Treatment keywords: {row['treatment_matched']}")
        print(f"Text length: {len(row['clean_text'])}")
        print("-" * 40)

        md_content.extend([
            f"\n### Record {idx+1}",
            "```",
            f"Text preview: {row['clean_text'][:100]}...",
            f"Emergency keywords: {row['matched']}",
            f"Treatment keywords: {row['treatment_matched']}",
            f"Text length: {len(row['clean_text'])}",
            "```\n"
        ])

    # Save markdown content
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(md_content))

    print(f"\n✅ Comparison complete!")
    print(f"Report saved to: {output_file}")

if __name__ == "__main__":
    # Compare using CSV format
    print("\nComparing CSV files...")
    load_and_compare_subsets('csv')

    # Compare using JSONL format
    print("\nComparing JSONL files...")
    load_and_compare_subsets('jsonl')
dataset/scripts/data_explorer.py
ADDED
@@ -0,0 +1,123 @@
# /scripts/data_explorer.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json

def analyze_subset(file_path, keywords_path, output_dir="analysis"):
    """Analyze subset data quality and distribution"""
    print(f"\n{'='*50}")
    print(f"Starting dataset analysis: {file_path}")
    print(f"Using keywords file: {keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*50}\n")

    # Load data
    print("1️⃣ Loading data...")
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)

    # 1. Basic statistics
    print("\n2️⃣ Calculating basic statistics...")
    total = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_len = df['text_length'].mean()
    print(f"Total records: {total}")
    print(f"Average text length: {avg_len:.2f}")

    # Initialize statistics dictionary with native Python types
    stats = {
        'basic_statistics': {
            'total_records': int(total),
            'avg_length': float(avg_len)
        },
        'keyword_statistics': {}
    }

    # 2. Keyword analysis
    print("\n3️⃣ Performing keyword analysis...")
    with open(keywords_path, 'r') as f:
        keywords = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(keywords)} keywords")

    # Count keywords and store in stats
    for keyword in keywords:
        cnt = df['clean_text'].str.contains(keyword, case=False).sum()
        stats['keyword_statistics'][keyword] = int(cnt)
        print(f" - {keyword}: {cnt} records")

    # 3. Visualization
    print("\n4️⃣ Generating visualizations...")
    output_path = Path(output_dir) / "plots"
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Charts will be saved in: {output_path}")

    # 3.1 Keyword distribution chart
    print(" - Generating keyword distribution chart...")
    plt.figure(figsize=(15, 8))
    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
    plt.xticks(rotation=45, ha='right')
    # TODO: change the title to the name of the subset
    plt.title('Keyword Distribution for Emergency Subset')
    plt.xlabel('Keywords')
    plt.ylabel('Match Count')
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_distribution_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 3.2 Text length distribution
    print(" - Generating text length distribution...")
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title('Text Length Distribution')
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "text_length_dist_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 3.3 Keyword co-occurrence analysis
    print(" - Generating keyword co-occurrence heatmap...")
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    for text in df['clean_text']:
        present_keywords = [k for k in keywords if k.lower() in text.lower()]
        for i, k1 in enumerate(present_keywords):
            for j, k2 in enumerate(present_keywords):
                if i != j:
                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1

    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_cooccurrence_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 4. Save statistics
    print("\n5️⃣ Saving statistics...")
    stats_path = Path(output_dir) / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)
    # TODO: change the name of the file to the name of the subset
    stats_file = stats_path / "analysis_stats_emergency_subset.json"

    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to: {stats_file}")

    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")

if __name__ == "__main__":
    # Set file paths
    emergency_subset = "../dataset/emergency/emergency_subset.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    output_dir = "../analysis"

    # Run analysis
    analyze_subset(emergency_subset, emergency_keywords, output_dir)
dataset/scripts/data_explorer_opt.py
ADDED
@@ -0,0 +1,118 @@
# /scripts/data_explorer_opt.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json

def analyze_subset(file_path, keywords_path, output_dir="analysis", subset_name="emergency"):
    """Analyze subset data quality and distribution"""
    print(f"\n{'='*50}")
    print(f"Starting optimized dataset analysis: {file_path}")
    print(f"Using keywords file: {keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*50}\n")

    # Load data
    print("1️⃣ Loading data...")
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)

    # 1. Basic statistics
    print("\n2️⃣ Calculating basic statistics...")
    total = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_len = df['text_length'].mean()
    print(f"Total records: {total}")
    print(f"Average text length: {avg_len:.2f}")

    # Initialize statistics dictionary with native Python types
    stats = {
        'basic_statistics': {
            'total_records': int(total),
            'avg_length': float(avg_len)
        },
        'keyword_statistics': {}
    }

    # 2. Keyword analysis
    print("\n3️⃣ Performing keyword analysis...")
    with open(keywords_path, 'r') as f:
        keywords = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(keywords)} keywords")

    # Count keywords and store in stats
    for keyword in keywords:
        cnt = df['clean_text'].str.contains(keyword, case=False).sum()
        stats['keyword_statistics'][keyword] = int(cnt)
        print(f" - {keyword}: {cnt} records")

    # 3. Visualization
    print("\n4️⃣ Generating visualizations...")
    output_path = Path(output_dir) / "plots"
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Charts will be saved in: {output_path}")

    # 3.1 Keyword distribution chart
    print(" - Generating keyword distribution chart...")
    plt.figure(figsize=(15, 8))
    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
    plt.xticks(rotation=45, ha='right')
    plt.title(f'Keyword Distribution for {subset_name.capitalize()} Subset (Optimized)')
    plt.xlabel('Keywords')
    plt.ylabel('Match Count')
    plt.savefig(output_path / f"keyword_distribution_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 3.2 Text length distribution
    print(" - Generating text length distribution...")
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title(f'Text Length Distribution ({subset_name.capitalize()} Subset - Optimized)')
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    plt.savefig(output_path / f"text_length_dist_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 3.3 Keyword co-occurrence analysis
    print(" - Generating keyword co-occurrence heatmap...")
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    for text in df['clean_text']:
        present_keywords = [k for k in keywords if k.lower() in text.lower()]
        for i, k1 in enumerate(present_keywords):
            for j, k2 in enumerate(present_keywords):
                if i != j:
                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1

    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title(f'Keyword Co-occurrence Heatmap ({subset_name.capitalize()} Subset - Optimized)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(output_path / f"keyword_cooccurrence_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 4. Save statistics
    print("\n5️⃣ Saving statistics...")
    stats_path = Path(output_dir) / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)
    stats_file = stats_path / f"analysis_stats_{subset_name}_subset_opt.json"

    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to: {stats_file}")

    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")

if __name__ == "__main__":
    # Set file paths for optimized version
    emergency_subset = "../dataset/emergency/emergency_subset_opt.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    output_dir = "../analysis"

    # Run analysis
    analyze_subset(emergency_subset, emergency_keywords, output_dir, "emergency")
dataset/scripts/data_explorer_treatment.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /scripts/data_explorer_treatment.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
import numpy as np
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import json
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
import re
|
| 10 |
+
|
| 11 |
+
def calculate_density(matches, text_length):
|
| 12 |
+
"""
|
| 13 |
+
Calculate keyword density per 1000 words
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
matches: Number of keyword matches
|
| 17 |
+
text_length: Total text length
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
float: Density per 1000 words
|
| 21 |
+
"""
|
| 22 |
+
return (matches / text_length) * 1000
|
| 23 |
+
|
| 24 |
+
def analyze_treatment_subset(
|
| 25 |
+
treatment_file_path,
|
| 26 |
+
emergency_keywords_path,
|
| 27 |
+
treatment_keywords_path,
|
| 28 |
+
output_dir="analysis_treatment"
|
| 29 |
+
):
|
| 30 |
+
"""
|
| 31 |
+
Specialized analysis for treatment subset focusing on:
|
| 32 |
+
1. Dual keyword analysis (emergency + treatment)
|
| 33 |
+
2. Path B effectiveness validation
|
| 34 |
+
3. Condition mapping data preparation
|
| 35 |
+
4. RAG readiness assessment
|
| 36 |
+
"""
|
| 37 |
+
print(f"\n{'='*60}")
|
| 38 |
+
print(f"Treatment Subset Analysis")
|
| 39 |
+
print(f"Treatment file: {treatment_file_path}")
|
| 40 |
+
print(f"Emergency keywords: {emergency_keywords_path}")
|
| 41 |
+
print(f"Treatment keywords: {treatment_keywords_path}")
|
| 42 |
+
print(f"Output directory: {output_dir}")
|
| 43 |
+
print(f"{'='*60}\n")
|
| 44 |
+
|
| 45 |
+
# Load data
|
| 46 |
+
print("1️⃣ Loading treatment subset data...")
|
| 47 |
+
df = pd.read_csv(treatment_file_path)
|
| 48 |
+
output_dir = Path(output_dir)
|
| 49 |
+
|
| 50 |
+
# Load keyword lists
|
| 51 |
+
print("2️⃣ Loading keyword lists...")
|
| 52 |
+
with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
|
| 53 |
+
emergency_keywords = [line.strip() for line in f if line.strip()]
|
| 54 |
+
|
| 55 |
+
with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
|
| 56 |
+
treatment_keywords = [line.strip() for line in f if line.strip()]
|
| 57 |
+
|
| 58 |
+
print(f" Emergency keywords: {len(emergency_keywords)}")
|
| 59 |
+
print(f" Treatment keywords: {len(treatment_keywords)}")
|
| 60 |
+
|
| 61 |
+
# Basic statistics
|
| 62 |
+
print("\n3️⃣ Computing basic statistics...")
|
| 63 |
+
total_records = len(df)
|
| 64 |
+
df['text_length'] = df['clean_text'].str.len()
|
| 65 |
+
avg_length = df['text_length'].mean()
|
| 66 |
+
|
| 67 |
+
print(f" Total treatment records: {total_records}")
|
| 68 |
+
print(f" Average text length: {avg_length:.2f} characters")
|
| 69 |
+
|
| 70 |
+
# Initialize comprehensive statistics
|
| 71 |
+
stats = {
|
| 72 |
+
'basic_statistics': {
|
| 73 |
+
'total_records': int(total_records),
|
| 74 |
+
'avg_text_length': float(avg_length),
|
| 75 |
+
'emergency_keywords_count': len(emergency_keywords),
|
| 76 |
+
'treatment_keywords_count': len(treatment_keywords)
|
| 77 |
+
},
|
| 78 |
+
'emergency_keyword_stats': {},
|
| 79 |
+
'treatment_keyword_stats': {},
|
| 80 |
+
'cooccurrence_analysis': {},
|
| 81 |
+
'path_b_validation': {},
|
| 82 |
+
'condition_mapping_candidates': {}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
# Emergency keyword analysis in treatment subset
|
| 86 |
+
print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
|
| 87 |
+
for keyword in emergency_keywords:
|
| 88 |
+
count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
|
| 89 |
+
stats['emergency_keyword_stats'][keyword] = int(count)
|
| 90 |
+
print(f" Emergency: {keyword} -> {count} records")
|
| 91 |
+
|
| 92 |
+
# Treatment keyword analysis
|
| 93 |
+
print("\n5️⃣ Analyzing treatment keywords...")
|
| 94 |
+
for keyword in treatment_keywords:
|
| 95 |
+
count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
|
| 96 |
+
stats['treatment_keyword_stats'][keyword] = int(count)
|
| 97 |
+
print(f" Treatment: {keyword} -> {count} records")
|
| 98 |
+
|
| 99 |
+
# Step 6: Co-occurrence analysis
|
| 100 |
+
print("\n6️⃣ Computing keyword co-occurrence patterns...")
|
| 101 |
+
|
| 102 |
+
# Initialize matrices for full dataset
|
| 103 |
+
emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
|
| 104 |
+
treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
|
| 105 |
+
|
| 106 |
+
# Pre-process text
|
| 107 |
+
print(" Pre-processing text...")
|
| 108 |
+
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
|
| 109 |
+
|
| 110 |
+
# Process all emergency keywords
|
| 111 |
+
print("\n Processing all emergency keywords...")
|
| 112 |
+
for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
|
| 113 |
+
# Using word boundary instead of negative lookbehind/lookahead
|
| 114 |
+
pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
|
| 115 |
+
emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
| 116 |
+
matches = emergency_matrix[:, i].sum()
|
| 117 |
+
print(f" - {keyword}: {matches} matches")
|
| 118 |
+
|
| 119 |
+
# Process all treatment keywords
|
| 120 |
+
print("\n Processing all treatment keywords...")
|
| 121 |
+
for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
|
| 122 |
+
# Using word boundary instead of negative lookbehind/lookahead
|
| 123 |
+
pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
|
| 124 |
+
treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
| 125 |
+
matches = treatment_matrix[:, i].sum()
|
| 126 |
+
print(f" - {keyword}: {matches} matches")
|
| 127 |
+
|
| 128 |
+
# Compute co-occurrence matrix
|
| 129 |
+
print("\n Computing co-occurrence matrix...")
|
| 130 |
+
cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
|
| 131 |
+
print(" Computation completed successfully")
|
| 132 |
+
|
| 133 |
+
# Extract results
|
| 134 |
+
print(" Extracting co-occurrence pairs...")
|
| 135 |
+
cooccurrence_pairs = []
|
| 136 |
+
for i, em_kw in enumerate(emergency_keywords):
|
| 137 |
+
for j, tr_kw in enumerate(treatment_keywords):
|
| 138 |
+
count = int(cooc_matrix[i, j])
|
| 139 |
+
if count > 0:
|
| 140 |
+
cooccurrence_pairs.append({
|
| 141 |
+
'emergency_keyword': em_kw,
|
| 142 |
+
'treatment_keyword': tr_kw,
|
| 143 |
+
'cooccurrence_count': count,
|
| 144 |
+
'percentage': float(count / len(df) * 100)
|
| 145 |
+
})
|
| 146 |
+
|
| 147 |
+
# Sort and store results
|
| 148 |
+
cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
|
| 149 |
+
stats['cooccurrence_analysis'] = cooccurrence_pairs[:20] # Top 20 pairs
|
| 150 |
+
|
| 151 |
+
print(f" Found {len(cooccurrence_pairs)} co-occurrence pairs")
|
| 152 |
+
print(" Top 5 co-occurrence pairs:")
|
| 153 |
+
for i, pair in enumerate(cooccurrence_pairs[:5]):
|
| 154 |
+
print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
|
| 155 |
+
|
| 156 |
+
# Step 7: Path B validation metrics
|
| 157 |
+
print("\n7️⃣ Validating Path B strategy effectiveness...")
|
| 158 |
+
|
| 159 |
+
# Compute keyword density with progress bar
|
| 160 |
+
print(" Computing keyword density...")
|
| 161 |
+
with tqdm(total=2, desc="Density calculation") as pbar:
|
| 162 |
+
# Calculate density per 1000 words for both emergency and treatment keywords
|
| 163 |
+
emergency_density = calculate_density(
|
| 164 |
+
emergency_matrix.sum(axis=1),
|
| 165 |
+
df['text_length']
|
| 166 |
+
)
|
| 167 |
+
pbar.update(1)
|
| 168 |
+
|
| 169 |
+
treatment_density = calculate_density(
|
| 170 |
+
treatment_matrix.sum(axis=1),
|
| 171 |
+
df['text_length']
|
| 172 |
+
)
|
| 173 |
+
pbar.update(1)
|
| 174 |
+
|
| 175 |
+
# Store density in dataframe for visualization
|
| 176 |
+
df['emergency_keyword_density'] = emergency_density
|
| 177 |
+
df['treatment_keyword_density'] = treatment_density
|
| 178 |
+
|
| 179 |
+
# Calculate statistics with the new density metrics
|
| 180 |
+
stats['path_b_validation'] = {
|
| 181 |
+
'avg_emergency_density': float(np.mean(emergency_density)),
|
| 182 |
+
'avg_treatment_density': float(np.mean(treatment_density)),
|
| 183 |
+
'high_density_records': int(sum(
|
| 184 |
+
(emergency_density >= np.percentile(emergency_density, 75)) &
|
| 185 |
+
(treatment_density >= np.percentile(treatment_density, 75))
|
| 186 |
+
)),
|
| 187 |
+
'precision_estimate': float(sum(
|
| 188 |
+
(emergency_density > 0) & (treatment_density > 0)
|
| 189 |
+
) / len(df))
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
# Print detailed results
|
| 193 |
+
print("\n Results:")
|
| 194 |
+
print(f" - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
|
| 195 |
+
print(f" - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
|
| 196 |
+
print(f" - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
|
| 197 |
+
print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
|
| 198 |
+
|
| 199 |
+
# Sample distribution analysis
|
| 200 |
+
print("\n Density Distribution:")
|
| 201 |
+
density_counts = pd.DataFrame({
|
| 202 |
+
'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
|
| 203 |
+
'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
|
| 204 |
+
}).value_counts().head()
|
| 205 |
+
print(" Top 5 density combinations (emergency, treatment):")
|
| 206 |
+
for (em, tr), count in density_counts.items():
|
| 207 |
+
print(f" - {count} documents have {em} emergency and {tr} treatment density")
|
| 208 |
+
|
| 209 |
+
# Visualization
|
| 210 |
+
print("\n8️⃣ Generating visualizations...")
|
| 211 |
+
output_plots = output_dir / "plots"
|
| 212 |
+
output_plots.mkdir(parents=True, exist_ok=True)
|
| 213 |
+
|
| 214 |
+
# 1. Keyword density scatter plot with improved visualization
|
| 215 |
+
plt.figure(figsize=(12, 8))
|
| 216 |
+
plt.scatter(
|
| 217 |
+
emergency_density,
|
| 218 |
+
treatment_density,
|
| 219 |
+
alpha=0.6,
|
| 220 |
+
c=np.log1p(df['text_length']), # Color by log text length
|
| 221 |
+
cmap='viridis'
|
| 222 |
+
)
|
| 223 |
+
plt.colorbar(label='Log Text Length')
|
| 224 |
+
plt.xlabel('Emergency Keyword Density (per 1000 words)')
|
| 225 |
+
plt.ylabel('Treatment Keyword Density (per 1000 words)')
|
| 226 |
+
plt.title('Emergency vs Treatment Keyword Density')
|
| 227 |
+
plt.grid(True, alpha=0.3)
|
| 228 |
+
|
| 229 |
+
# Add mean lines
|
| 230 |
+
plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
|
| 231 |
+
plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
|
| 232 |
+
plt.legend()
|
| 233 |
+
|
| 234 |
+
plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight', dpi=300)
|
| 235 |
+
plt.close()
|
| 236 |
+
|
| 237 |
+
# Save comprehensive statistics
|
| 238 |
+
print("\n9️⃣ Saving analysis results...")
|
| 239 |
+
stats_dir = output_dir / "stats"
|
| 240 |
+
stats_dir.mkdir(parents=True, exist_ok=True)
|
| 241 |
+
|
| 242 |
+
with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
|
| 243 |
+
json.dump(stats, f, indent=2, ensure_ascii=False)
|
| 244 |
+
|
| 245 |
+
print(f"✅ Treatment subset analysis complete!")
|
| 246 |
+
print(f" Results saved to: {output_dir}")
|
| 247 |
+
print(f" Plots: {output_plots}")
|
| 248 |
+
print(f" Statistics: {stats_dir}")
|
| 249 |
+
|
| 250 |
+
return stats
|
| 251 |
+
|
| 252 |
+
if __name__ == "__main__":
|
| 253 |
+
# Configuration
|
| 254 |
+
treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
|
| 255 |
+
emergency_keywords = "../keywords/emergency_keywords.txt"
|
| 256 |
+
treatment_keywords = "../keywords/treatment_keywords.txt"
|
| 257 |
+
output_directory = "../analysis_treatment"
|
| 258 |
+
|
| 259 |
+
# Run analysis
|
| 260 |
+
results = analyze_treatment_subset(
|
| 261 |
+
treatment_file,
|
| 262 |
+
emergency_keywords,
|
| 263 |
+
treatment_keywords,
|
| 264 |
+
output_directory
|
| 265 |
+
)
|
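Worth a note for reviewers: the `path_b_validation` block above boils down to two boolean filters over the density vectors. A minimal sketch of the same arithmetic on toy data (all values below are invented for illustration):

```python
import numpy as np

# Toy density vectors for 8 documents (values invented for illustration)
emergency_density = np.array([0.0, 1.2, 3.5, 0.8, 4.1, 0.0, 2.2, 5.0])
treatment_density = np.array([0.5, 0.0, 2.8, 1.1, 3.9, 0.0, 0.4, 4.2])

# "High density" = at or above the 75th percentile in BOTH vectors
em_hi = emergency_density >= np.percentile(emergency_density, 75)
tr_hi = treatment_density >= np.percentile(treatment_density, 75)
high_density_records = int(np.sum(em_hi & tr_hi))

# Precision estimate = share of documents with at least one match of each type
precision = float(np.sum((emergency_density > 0) & (treatment_density > 0))
                  / len(emergency_density))

print(high_density_records, round(precision, 2))  # -> 2 0.62
```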
dataset/scripts/data_explorer_treatment_opt.py
ADDED
@@ -0,0 +1,262 @@
# /scripts/data_explorer_treatment_opt.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json
from tqdm import tqdm
import re

def calculate_density(matches, text_length):
    """
    Calculate keyword density per 1000 words

    Args:
        matches: Number of keyword matches
        text_length: Total text length

    Returns:
        float: Density per 1000 words
    """
    # NOTE: callers pass text_length = clean_text.str.len(), i.e. a character
    # count, so in practice this is density per 1,000 characters, not words.
    return (matches / text_length) * 1000

def analyze_treatment_subset(
    treatment_file_path,
    emergency_keywords_path,
    treatment_keywords_path,
    output_dir="analysis_treatment_opt"  # Updated default output directory
):
    """
    Specialized analysis for optimized treatment subset focusing on:
    1. Dual keyword analysis (emergency + treatment)
    2. Path B effectiveness validation
    3. Condition mapping data preparation
    4. RAG readiness assessment
    """
    print(f"\n{'='*60}")
    print(f"Treatment Subset Analysis (Optimized Version)")
    print(f"Treatment file: {treatment_file_path}")
    print(f"Emergency keywords: {emergency_keywords_path}")
    print(f"Treatment keywords: {treatment_keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*60}\n")

    # Load data
    print("1️⃣ Loading optimized treatment subset data...")
    df = pd.read_csv(treatment_file_path)
    output_dir = Path(output_dir)

    # Load keyword lists
    print("2️⃣ Loading keyword lists...")
    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
        emergency_keywords = [line.strip() for line in f if line.strip()]

    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
        treatment_keywords = [line.strip() for line in f if line.strip()]

    print(f"   Emergency keywords: {len(emergency_keywords)}")
    print(f"   Treatment keywords: {len(treatment_keywords)}")

    # Basic statistics
    print("\n3️⃣ Computing basic statistics...")
    total_records = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_length = df['text_length'].mean()

    print(f"   Total treatment records: {total_records}")
    print(f"   Average text length: {avg_length:.2f} characters")

    # Initialize comprehensive statistics
    stats = {
        'basic_statistics': {
            'total_records': int(total_records),
            'avg_text_length': float(avg_length),
            'emergency_keywords_count': len(emergency_keywords),
            'treatment_keywords_count': len(treatment_keywords)
        },
        'emergency_keyword_stats': {},
        'treatment_keyword_stats': {},
        'cooccurrence_analysis': {},
        'path_b_validation': {},
        'condition_mapping_candidates': {}
    }

    # Emergency keyword analysis in treatment subset
    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
    for keyword in emergency_keywords:
        # NOTE: plain str.contains treats the keyword as a regex substring;
        # the matrices in step 6 use escaped word-boundary patterns instead.
        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
        stats['emergency_keyword_stats'][keyword] = int(count)
        print(f"   Emergency: {keyword} -> {count} records")

    # Treatment keyword analysis
    print("\n5️⃣ Analyzing treatment keywords...")
    for keyword in treatment_keywords:
        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
        stats['treatment_keyword_stats'][keyword] = int(count)
        print(f"   Treatment: {keyword} -> {count} records")

    # Step 6: Co-occurrence analysis
    print("\n6️⃣ Computing keyword co-occurrence patterns...")

    # Initialize matrices for full dataset
    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)

    # Pre-process text
    print("   Pre-processing text...")
    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

    # Process all emergency keywords
    print("\n   Processing all emergency keywords...")
    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
        matches = emergency_matrix[:, i].sum()
        print(f"   - {keyword}: {matches} matches")

    # Process all treatment keywords
    print("\n   Processing all treatment keywords...")
    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
        matches = treatment_matrix[:, i].sum()
        print(f"   - {keyword}: {matches} matches")

    # Compute co-occurrence matrix
    print("\n   Computing co-occurrence matrix...")
    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
    print("   Computation completed successfully")

    # Extract results
    print("   Extracting co-occurrence pairs...")
    cooccurrence_pairs = []
    for i, em_kw in enumerate(emergency_keywords):
        for j, tr_kw in enumerate(treatment_keywords):
            count = int(cooc_matrix[i, j])
            if count > 0:
                cooccurrence_pairs.append({
                    'emergency_keyword': em_kw,
                    'treatment_keyword': tr_kw,
                    'cooccurrence_count': count,
                    'percentage': float(count / len(df) * 100)
                })

    # Sort and store results
    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # Top 20 pairs

    print(f"   Found {len(cooccurrence_pairs)} co-occurrence pairs")
    print("   Top 5 co-occurrence pairs:")
    for i, pair in enumerate(cooccurrence_pairs[:5]):
        print(f"   {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")

    # Step 7: Path B validation metrics
    print("\n7️⃣ Validating Path B strategy effectiveness...")

    # Compute keyword density with progress bar
    print("   Computing keyword density...")
    with tqdm(total=2, desc="Density calculation") as pbar:
        emergency_density = calculate_density(
            emergency_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)

        treatment_density = calculate_density(
            treatment_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)

    # Store density in dataframe for visualization
    df['emergency_keyword_density'] = emergency_density
    df['treatment_keyword_density'] = treatment_density

    # Calculate statistics with the new density metrics
    stats['path_b_validation'] = {
        'avg_emergency_density': float(np.mean(emergency_density)),
        'avg_treatment_density': float(np.mean(treatment_density)),
        'high_density_records': int(sum(
            (emergency_density >= np.percentile(emergency_density, 75)) &
            (treatment_density >= np.percentile(treatment_density, 75))
        )),
        'precision_estimate': float(sum(
            (emergency_density > 0) & (treatment_density > 0)
        ) / len(df))
    }

    # Print detailed results
    print("\n   Results:")
    print(f"   - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
    print(f"   - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
    print(f"   - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
    print(f"   - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")

    # Sample distribution analysis
    print("\n   Density Distribution:")
    density_counts = pd.DataFrame({
        'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
        'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
    }).value_counts().head()
    print("   Top 5 density combinations (emergency, treatment):")
    for (em, tr), count in density_counts.items():
        print(f"   - {count} documents have {em} emergency and {tr} treatment density")

    # Visualization
    print("\n8️⃣ Generating visualizations...")
    output_plots = output_dir / "plots"
    output_plots.mkdir(parents=True, exist_ok=True)

    # 1. Keyword density scatter plot with improved visualization
    plt.figure(figsize=(12, 8))
    plt.scatter(
        emergency_density,
        treatment_density,
        alpha=0.6,
        c=np.log1p(df['text_length']),
        cmap='viridis'
    )
    plt.colorbar(label='Log Text Length')
    plt.xlabel('Emergency Keyword Density (per 1000 words)')
    plt.ylabel('Treatment Keyword Density (per 1000 words)')
    plt.title('Emergency vs Treatment Keyword Density (Optimized)')
    plt.grid(True, alpha=0.3)

    # Add mean lines
    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
    plt.legend()

    plt.savefig(output_plots / "keyword_density_scatter_opt.png", bbox_inches='tight', dpi=300)
    plt.close()

    # Save comprehensive statistics
    print("\n9️⃣ Saving analysis results...")
    stats_dir = output_dir / "stats"
    stats_dir.mkdir(parents=True, exist_ok=True)

    with open(stats_dir / "treatment_analysis_comprehensive_opt.json", 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print(f"✅ Treatment subset analysis complete! (Optimized Version)")
    print(f"   Results saved to: {output_dir}")
    print(f"   Plots: {output_plots}")
    print(f"   Statistics: {stats_dir}")

    return stats

if __name__ == "__main__":
    # Configuration for optimized version
    treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    treatment_keywords = "../keywords/treatment_keywords.txt"
    output_directory = "../analysis_treatment_opt"

    # Run analysis
    results = analyze_treatment_subset(
        treatment_file,
        emergency_keywords,
        treatment_keywords,
        output_directory
    )
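The boolean matrix product in step 6 is the core trick in both explorer scripts: entry (i, j) of `cooc_matrix` counts documents that match emergency keyword i and treatment keyword j. A minimal sketch with made-up membership matrices:

```python
import numpy as np

# Toy membership matrices: 3 documents x 2 emergency keywords,
# and the same 3 documents x 2 treatment keywords (values invented).
emergency_matrix = np.array([[1, 0],
                             [1, 1],
                             [0, 1]], dtype=bool)
treatment_matrix = np.array([[1, 1],
                             [0, 1],
                             [1, 0]], dtype=bool)

# (keywords x docs) @ (docs x keywords) -> pairwise co-occurrence counts
cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
print(cooc_matrix)
# [[1 2]
#  [1 1]]
```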
dataset/scripts/keyword_Match_Clean_for_subset_filter.txt
ADDED
@@ -0,0 +1,85 @@
# Keyword Matching and Text Cleaning Logic for Subset Filtering

## 1. Keyword Preprocessing
```python
def preprocess_keywords(keywords_file):
    # Handle special medical term variants
    special_terms = {
        'x-ray': ['x-ray', 'x ray', 'xray'],
        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
        'point-of-care': ['point-of-care', 'point of care']
    }

    # Read and preprocess keywords
    with open(keywords_file, "r", encoding="utf-8") as f:
        keywords = [
            line.strip()  # Remove whitespace
            .lower()      # Convert to lowercase
            for line in f
            if line.strip()
        ]

    # Process special term variants
    processed_keywords = []
    for kw in keywords:
        if kw in special_terms:
            processed_keywords.extend(special_terms[kw])
        else:
            processed_keywords.append(kw)

    return processed_keywords
```
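For instance, under the hypothetical assumption that `keywords_demo.txt` contains the two lines `x-ray` and `defibrillation`, the expansion behaves like this:

```python
print(preprocess_keywords("keywords_demo.txt"))
# -> ['x-ray', 'x ray', 'xray', 'defibrillation']
```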

## 2. Regex Pattern Processing
```python
import re

def create_regex_pattern(keywords):
    # Simple word boundary matching
    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
    return re.compile(pattern, re.IGNORECASE)
```

### Regex Pattern Explanation:
- `\b`: Word boundary matching
- `(?:...)`: Non-capturing group
- `re.escape()`: Escape special characters
- `re.IGNORECASE`: Case-insensitive matching

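A quick sanity check of the compiled pattern (toy keyword list; note that `\b` fires at any word/non-word transition, so hyphenated contexts such as "non-emergency" still match, while fused words such as "subacute" do not):

```python
pattern = create_regex_pattern(['emergency', 'acute', 'x-ray'])

print(pattern.findall("Emergency X-RAY ordered"))  # ['Emergency', 'X-RAY']
print(pattern.findall("non-emergency visit"))      # ['emergency'] (hyphen is a boundary)
print(pattern.findall("subacute condition"))       # [] (no boundary inside 'subacute')
```
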
## 3. Text Preprocessing and Matching
```python
# Create lowercase version of text
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

# Match keywords
df["treatment_matched"] = (
    df["clean_text_lower"]
    .apply(lambda text: "|".join(pattern.findall(text)) or "")
)
```

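Putting the pieces together on toy data (illustrative keywords, `create_regex_pattern` from section 2 in scope):

```python
import pandas as pd

pattern = create_regex_pattern(['chest pain', 'sepsis'])
df = pd.DataFrame({'clean_text': ["Chest pain with sepsis", "Routine visit", None]})
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
df['treatment_matched'] = df['clean_text_lower'].apply(
    lambda text: "|".join(pattern.findall(text)) or ""
)
print(df['treatment_matched'].tolist())  # ['chest pain|sepsis', '', '']
```
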
## 4. Processing Logic Details

### 4.1 Special Term Handling Rationale
- Common variants in medical literature
- Maintain semantic consistency
- Improve matching accuracy

### 4.2 Regex Matching Strategy
- Word boundary matching for complete terms
- Precompiled patterns for performance
- Case-insensitive matching for flexibility

### 4.3 Text Preprocessing Steps
1. Fill null values (fillna)
2. Convert to lowercase (str.lower)
3. Create a dedicated lowercase column to avoid repeated conversions

## 5. Output Format
- matched column: Pipe-separated matched keywords
- type column: Document type identifier ("emergency" or "treatment")
- condition column: Reserved for future condition mapping

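For example (illustrative values only), a record that hits both keyword lists might carry `matched = "sepsis|shock"`, `treatment_matched = "antibiotic|fluid resuscitation"`, and `type = "treatment"`, with `condition` left empty until the mapping is built.
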
## 6. Important Considerations
1. Regular maintenance is required for the special term variants
2. Precompile regex patterns for performance
3. Store the preprocessed lowercase text once to avoid redundant computation
4. Keep the column structure consistent between the emergency and treatment subsets
dataset/scripts/test_keyword_matching.py
ADDED
@@ -0,0 +1,175 @@
import pandas as pd
import re
from pathlib import Path
import json

def test_special_terms_matching():
    """
    Test special medical term matching logic
    """
    # Test cases for different scenarios
    test_cases = {
        "x-ray variants": [
            "Patient needs an x-ray of the chest",
            "Ordered chest xray",
            "X ray shows pneumonia",
            "XRAY negative"
        ],
        "ct-scan variants": [
            "CT scan reveals nodule",
            "CT-scan indicates mass",
            "Requires ctscan urgently",
            "CTSCAN of abdomen"
        ],
        "point-of-care variants": [
            "Point-of-care testing needed",
            "Point of care ultrasound",
            "POC testing results"
        ],
        "mixed cases": [
            "Ordered both x-ray and CT scan",
            "XRAY and CTSCAN negative",
            "Multiple point-of-care tests with x-ray"
        ],
        "negative cases": [
            "No imaging mentioned",
            "Regular examination only",
            "Laboratory tests pending"
        ]
    }

    # Special terms dictionary (from keyword_Match_Clean_for_subset_filter.txt)
    special_terms = {
        'x-ray': ['x-ray', 'x ray', 'xray'],
        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
        'point-of-care': ['point-of-care', 'point of care']
    }

    # Create test DataFrame
    test_df = pd.DataFrame({
        'clean_text': [text for cases in test_cases.values() for text in cases],
        'category': [cat for cat, texts in test_cases.items() for _ in texts]
    })

    # Process keywords
    processed_keywords = []
    for term, variants in special_terms.items():
        processed_keywords.extend(variants)

    # Create regex pattern
    pattern = r"\b(?:" + "|".join(map(re.escape, processed_keywords)) + r")\b"

    # Apply matching logic
    test_df['matched'] = (
        test_df['clean_text']
        .fillna("")
        .str.findall(pattern, flags=re.IGNORECASE)
        .apply(lambda lst: "|".join(lst) if lst else "")
    )

    return test_df

def test_basic_matching():
    """
    Test basic keyword matching functionality
    """
    # Basic test cases
    test_cases = {
        "simple matches": [
            "Emergency treatment required",
            "Acute condition observed",
            "Urgent care needed"
        ],
        "case variations": [
            "EMERGENCY situation",
            "Acute RESPIRATORY failure",
            "URgent surgical intervention"
        ],
        "multiple matches": [
            "Emergency treatment for acute condition",
            "Urgent care in emergency department",
            "Acute respiratory emergency"
        ],
        "partial words": [
            "Non-emergency situation",
            "Subacute condition",
            "Emergency-related"
        ]
    }

    # Create test DataFrame
    test_df = pd.DataFrame({
        'clean_text': [text for cases in test_cases.values() for text in cases],
        'category': [cat for cat, texts in test_cases.items() for _ in texts]
    })

    # Test keywords
    test_keywords = ['emergency', 'acute', 'urgent']
    pattern = r"\b(?:" + "|".join(map(re.escape, test_keywords)) + r")\b"

    # Apply matching logic
    test_df['matched'] = (
        test_df['clean_text']
        .fillna("")
        .str.findall(pattern, flags=re.IGNORECASE)
        .apply(lambda lst: "|".join(lst) if lst else "")
    )

    return test_df

def save_test_results(results_dict):
    """
    Save test results to JSON file
    """
    output_dir = Path("../analysis")
    output_dir.mkdir(exist_ok=True)

    output_file = output_dir / "keyword_matching_test_results.json"

    # Convert DataFrame results to dictionary
    for key, df in results_dict.items():
        results_dict[key] = df.to_dict(orient='records')

    with open(output_file, 'w') as f:
        json.dump(results_dict, f, indent=2)

    print(f"Results saved to: {output_file}")

def run_tests():
    """
    Run all tests and output results
    """
    print("🧪 Running keyword matching tests...")

    # Run tests
    special_terms_results = test_special_terms_matching()
    basic_matching_results = test_basic_matching()

    # Print results
    print("\n📊 Special Terms Matching Results:")
    for category in special_terms_results['category'].unique():
        print(f"\n{category}:")
        subset = special_terms_results[special_terms_results['category'] == category]
        for _, row in subset.iterrows():
            print(f"Text: {row['clean_text']}")
            print(f"Matched: {row['matched'] or 'No matches'}")
            print("-" * 50)

    print("\n📊 Basic Matching Results:")
    for category in basic_matching_results['category'].unique():
        print(f"\n{category}:")
        subset = basic_matching_results[basic_matching_results['category'] == category]
        for _, row in subset.iterrows():
            print(f"Text: {row['clean_text']}")
            print(f"Matched: {row['matched'] or 'No matches'}")
            print("-" * 50)

    # Save results
    results_dict = {
        'special_terms_matching': special_terms_results,
        'basic_matching': basic_matching_results
    }
    save_test_results(results_dict)

if __name__ == "__main__":
    run_tests()
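One behavior worth flagging for reviewers of the "partial words" cases above: `\b` treats a hyphen as a word boundary, so two of the three cases still match. A stand-alone check with the same pattern construction as the test:

```python
import re

keywords = ['emergency', 'acute', 'urgent']
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
for text in ["Non-emergency situation", "Subacute condition", "Emergency-related"]:
    print(text, "->", re.findall(pattern, text, flags=re.IGNORECASE))
# Non-emergency situation -> ['emergency']
# Subacute condition -> []
# Emergency-related -> ['Emergency']
```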
requirements.txt
CHANGED
@@ -10,12 +10,15 @@ Brotli==1.1.0
 certifi==2025.7.14
 charset-normalizer==3.4.2
 click==8.2.1
+contourpy==1.3.2
+cycler==0.12.1
 datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
 fastapi==0.116.1
 ffmpy==0.6.0
 filelock==3.18.0
+fonttools==4.59.0
 frozenlist==1.7.0
 fsspec==2025.3.0
 gradio==5.38.0
@@ -29,8 +32,10 @@ huggingface-hub==0.33.4
 idna==3.10
 Jinja2==3.1.6
 jiter==0.10.0
+kiwisolver==1.4.8
 markdown-it-py==3.0.0
 MarkupSafe==3.0.2
+matplotlib==3.10.3
 mdurl==0.1.2
 multidict==6.6.3
 multiprocess==0.70.16
@@ -46,6 +51,7 @@ pydantic==2.11.7
 pydantic_core==2.33.2
 pydub==0.25.1
 Pygments==2.19.2
+pyparsing==3.2.3
 python-dateutil==2.9.0.post0
 python-multipart==0.0.20
 pytz==2025.2
@@ -56,6 +62,7 @@ rich==14.0.0
 ruff==0.12.4
 safehttpx==0.1.6
 safetensors==0.5.3
+seaborn==0.13.2
 semantic-version==2.10.0
 shellingham==1.5.4
 six==1.17.0