Merge pull request #1 from YanBoChen0928/dataprocessing
s/p Data preprocessing, -> go for processing, embedding...
- .gitignore +1 -1
- dataset/analysis/integrity_check/integrity_check_report.json +29 -0
- dataset/analysis/keyword_matching_test_results.json +151 -0
- dataset/analysis/stats/analysis_stats_emergency_subset.json +55 -0
- dataset/analysis/stats/analysis_stats_emergency_subset_opt.json +55 -0
- dataset/analysis/subset_comparison_first10_records_20250726_163149.md +198 -0
- dataset/analysis/subset_comparison_first10_records_20250726_163158.md +198 -0
- dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json +293 -0
- dataset/check_source.py +18 -0
- dataset/filter_guidelines.py +31 -0
- dataset/keywords/emergency_keywords.txt +47 -0
- dataset/keywords/special_terms_emergency.json +26 -0
- dataset/keywords/special_terms_treatment.json +25 -0
- dataset/keywords/treatment_keywords.txt +105 -0
- dataset/scripts/01_filter_emergency.py +58 -0
- dataset/scripts/01_filter_emergency_opt.py +112 -0
- dataset/scripts/02_filter_treatment.py +103 -0
- dataset/scripts/02_filter_treatment_opt.py +131 -0
- dataset/scripts/check_subset_integrity.py +178 -0
- dataset/scripts/commit_message_20250726_special_terms.txt +39 -0
- dataset/scripts/compare_subsets_opt.py +124 -0
- dataset/scripts/data_explorer.py +123 -0
- dataset/scripts/data_explorer_opt.py +118 -0
- dataset/scripts/data_explorer_treatment.py +265 -0
- dataset/scripts/data_explorer_treatment_opt.py +262 -0
- dataset/scripts/keyword_Match_Clean_for_subset_filter.txt +85 -0
- dataset/scripts/test_keyword_matching.py +175 -0
- requirements.txt +7 -0
.gitignore CHANGED
@@ -1,4 +1,4 @@
-dataset/
+dataset/dataset/
 
 #virtual environment
 genAIvenv/
dataset/analysis/integrity_check/integrity_check_report.json ADDED
@@ -0,0 +1,29 @@
+{
+  "sample_analysis": {
+    "matched": {
+      "non_null": 100,
+      "non_empty": 100,
+      "unique_values": 84
+    },
+    "treatment_matched": {
+      "non_null": 100,
+      "non_empty": 100,
+      "unique_values": 100
+    }
+  },
+  "full_file_analysis": {
+    "total_records": 9367,
+    "matched_column": {
+      "non_null_count": 9367,
+      "non_empty_count": 9367,
+      "null_percentage": 0.0
+    },
+    "treatment_matched_column": {
+      "non_null_count": 9367,
+      "non_empty_count": 9367,
+      "null_percentage": 0.0
+    },
+    "both_matched_count": 3315,
+    "both_matched_percentage": 35.39019963702359
+  }
+}
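The PR ships dataset/scripts/check_subset_integrity.py (+178 lines, body not shown in this diff), which presumably produces the report above. As a minimal sketch — not that script — the full_file_analysis block could be computed with pandas roughly like this; the input path and the exact "both matched" criterion are assumptions:

```python
import pandas as pd

# Assumed path; the actual subset file produced by the filter scripts may differ.
df = pd.read_json("dataset/emergency/emergency_treatment_subset.jsonl", lines=True)

def column_stats(col: pd.Series) -> dict:
    """Non-null / non-empty counts and null percentage for one keyword column."""
    return {
        "non_null_count": int(col.notna().sum()),
        "non_empty_count": int((col.fillna("").str.len() > 0).sum()),
        "null_percentage": 100.0 * float(col.isna().mean()),
    }

# Records where both keyword columns matched something (assumed criterion).
both = int(((df["matched"].fillna("") != "") & (df["treatment_matched"].fillna("") != "")).sum())
report = {
    "total_records": len(df),
    "matched_column": column_stats(df["matched"]),
    "treatment_matched_column": column_stats(df["treatment_matched"]),
    "both_matched_count": both,
    "both_matched_percentage": 100.0 * both / len(df),
}
print(report)
```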
dataset/analysis/keyword_matching_test_results.json ADDED
@@ -0,0 +1,151 @@
+{
+  "special_terms_matching": [
+    {
+      "clean_text": "Patient needs an x-ray of the chest",
+      "category": "x-ray variants",
+      "matched": "x-ray"
+    },
+    {
+      "clean_text": "Ordered chest xray",
+      "category": "x-ray variants",
+      "matched": "xray"
+    },
+    {
+      "clean_text": "X ray shows pneumonia",
+      "category": "x-ray variants",
+      "matched": "X ray"
+    },
+    {
+      "clean_text": "XRAY negative",
+      "category": "x-ray variants",
+      "matched": "XRAY"
+    },
+    {
+      "clean_text": "CT scan reveals nodule",
+      "category": "ct-scan variants",
+      "matched": "CT scan"
+    },
+    {
+      "clean_text": "CT-scan indicates mass",
+      "category": "ct-scan variants",
+      "matched": "CT-scan"
+    },
+    {
+      "clean_text": "Requires ctscan urgently",
+      "category": "ct-scan variants",
+      "matched": "ctscan"
+    },
+    {
+      "clean_text": "CTSCAN of abdomen",
+      "category": "ct-scan variants",
+      "matched": "CTSCAN"
+    },
+    {
+      "clean_text": "Point-of-care testing needed",
+      "category": "point-of-care variants",
+      "matched": "Point-of-care"
+    },
+    {
+      "clean_text": "Point of care ultrasound",
+      "category": "point-of-care variants",
+      "matched": "Point of care"
+    },
+    {
+      "clean_text": "POC testing results",
+      "category": "point-of-care variants",
+      "matched": ""
+    },
+    {
+      "clean_text": "Ordered both x-ray and CT scan",
+      "category": "mixed cases",
+      "matched": "x-ray|CT scan"
+    },
+    {
+      "clean_text": "XRAY and CTSCAN negative",
+      "category": "mixed cases",
+      "matched": "XRAY|CTSCAN"
+    },
+    {
+      "clean_text": "Multiple point-of-care tests with x-ray",
+      "category": "mixed cases",
+      "matched": "point-of-care|x-ray"
+    },
+    {
+      "clean_text": "No imaging mentioned",
+      "category": "negative cases",
+      "matched": ""
+    },
+    {
+      "clean_text": "Regular examination only",
+      "category": "negative cases",
+      "matched": ""
+    },
+    {
+      "clean_text": "Laboratory tests pending",
+      "category": "negative cases",
+      "matched": ""
+    }
+  ],
+  "basic_matching": [
+    {
+      "clean_text": "Emergency treatment required",
+      "category": "simple matches",
+      "matched": "Emergency"
+    },
+    {
+      "clean_text": "Acute condition observed",
+      "category": "simple matches",
+      "matched": "Acute"
+    },
+    {
+      "clean_text": "Urgent care needed",
+      "category": "simple matches",
+      "matched": "Urgent"
+    },
+    {
+      "clean_text": "EMERGENCY situation",
+      "category": "case variations",
+      "matched": "EMERGENCY"
+    },
+    {
+      "clean_text": "Acute RESPIRATORY failure",
+      "category": "case variations",
+      "matched": "Acute"
+    },
+    {
+      "clean_text": "URgent surgical intervention",
+      "category": "case variations",
+      "matched": "URgent"
+    },
+    {
+      "clean_text": "Emergency treatment for acute condition",
+      "category": "multiple matches",
+      "matched": "Emergency|acute"
+    },
+    {
+      "clean_text": "Urgent care in emergency department",
+      "category": "multiple matches",
+      "matched": "Urgent|emergency"
+    },
+    {
+      "clean_text": "Acute respiratory emergency",
+      "category": "multiple matches",
+      "matched": "Acute|emergency"
+    },
+    {
+      "clean_text": "Non-emergency situation",
+      "category": "partial words",
+      "matched": "emergency"
+    },
+    {
+      "clean_text": "Subacute condition",
+      "category": "partial words",
+      "matched": ""
+    },
+    {
+      "clean_text": "Emergency-related",
+      "category": "partial words",
+      "matched": "Emergency"
+    }
+  ]
+}
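The "partial words" cases above pin down a subtlety of the \b word-boundary matching used throughout the filter scripts: a hyphen is a non-word character, so it forms a boundary and "Non-emergency" still matches "emergency", while "Subacute" has no boundary before "acute" and does not match. A minimal reproduction, with the keyword list abbreviated to the three terms exercised in basic_matching:

```python
import re

# Same pattern shape as the filter scripts: \b-delimited, case-insensitive alternation.
pattern = re.compile(r"\b(?:emergency|acute|urgent)\b", re.IGNORECASE)

print(pattern.findall("Non-emergency situation"))  # ['emergency'] -- hyphen is a boundary
print(pattern.findall("Subacute condition"))       # []            -- no boundary inside a word
print(pattern.findall("Emergency-related"))        # ['Emergency']
```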
dataset/analysis/stats/analysis_stats_emergency_subset.json ADDED
@@ -0,0 +1,55 @@
+{
+  "basic_statistics": {
+    "total_records": 10282,
+    "avg_length": 25185.078194903715
+  },
+  "keyword_statistics": {
+    "Acute abdomen": 52,
+    "Acute bleeding": 31,
+    "Acute Coronary Syndrome": 345,
+    "Acute Kidney Injury": 202,
+    "Acute pancreatitis": 214,
+    "Acute respiratory distress syndrome": 231,
+    "Acute stroke": 67,
+    "Anaphylaxis": 1016,
+    "Anaphylactic Shock": 153,
+    "Arrhythmia": 1547,
+    "Atrial fibrillation": 771,
+    "Atrial flutter": 146,
+    "Bradycardia": 884,
+    "Cardiac arrest": 614,
+    "Cardiogenic Shock": 196,
+    "Chest pain": 1433,
+    "Dyspnea": 1319,
+    "Fever": 4270,
+    "Gastrointestinal Hemorrhage": 158,
+    "GI bleeding": 105,
+    "Hemorrhage": 1611,
+    "Hemorrhagic stroke": 117,
+    "Hyperthermia": 305,
+    "Hypovolemic Shock": 63,
+    "Hypotension": 1929,
+    "Hypothermia": 356,
+    "Internal bleeding": 70,
+    "Intracranial Hemorrhages": 6,
+    "Ischemic stroke": 224,
+    "Loss of consciousness": 422,
+    "Myocardial Infarction": 1708,
+    "MI": 10183,
+    "Pulmonary Edema": 487,
+    "Pulmonary Embolism": 654,
+    "Respiratory distress": 730,
+    "Respiratory failure": 579,
+    "Sepsis": 1181,
+    "Severe Sepsis": 81,
+    "Septic Shock": 244,
+    "Shock": 1881,
+    "Status Epilepticus": 150,
+    "Syncope": 834,
+    "Tachycardia": 1650,
+    "Tachypnea": 268,
+    "Traumatic Brain Injury": 171,
+    "Ventricular Tachycardia": 491,
+    "Ventricular fibrillation": 295
+  }
+}
dataset/analysis/stats/analysis_stats_emergency_subset_opt.json ADDED
@@ -0,0 +1,55 @@
+{
+  "basic_statistics": {
+    "total_records": 11914,
+    "avg_length": 23847.07579318449
+  },
+  "keyword_statistics": {
+    "Acute abdomen": 52,
+    "Acute bleeding": 31,
+    "Acute Coronary Syndrome": 351,
+    "Acute Kidney Injury": 202,
+    "Acute pancreatitis": 214,
+    "Acute respiratory distress syndrome": 231,
+    "Acute stroke": 67,
+    "Anaphylaxis": 1016,
+    "Anaphylactic Shock": 153,
+    "Arrhythmia": 1564,
+    "Atrial fibrillation": 771,
+    "Atrial flutter": 146,
+    "Bradycardia": 884,
+    "Cardiac arrest": 614,
+    "Cardiogenic Shock": 196,
+    "Chest pain": 1434,
+    "Dyspnea": 1319,
+    "Fever": 4279,
+    "Gastrointestinal Hemorrhage": 158,
+    "GI bleeding": 105,
+    "Hemorrhage": 1621,
+    "Hemorrhagic stroke": 117,
+    "Hyperthermia": 305,
+    "Hypovolemic Shock": 63,
+    "Hypotension": 1929,
+    "Hypothermia": 356,
+    "Internal bleeding": 70,
+    "Intracranial Hemorrhages": 6,
+    "Ischemic stroke": 225,
+    "Loss of consciousness": 422,
+    "Myocardial Infarction": 1710,
+    "MI": 11773,
+    "Pulmonary Edema": 487,
+    "Pulmonary Embolism": 654,
+    "Respiratory distress": 730,
+    "Respiratory failure": 579,
+    "Sepsis": 1188,
+    "Severe Sepsis": 81,
+    "Septic Shock": 244,
+    "Shock": 1892,
+    "Status Epilepticus": 150,
+    "Syncope": 834,
+    "Tachycardia": 1651,
+    "Tachypnea": 268,
+    "Traumatic Brain Injury": 171,
+    "Ventricular Tachycardia": 492,
+    "Ventricular fibrillation": 295
+  }
+}
dataset/analysis/subset_comparison_first10_records_20250726_163149.md ADDED
@@ -0,0 +1,198 @@
+# Optimized Subsets Comparison Report
+
+Generated on: 2025-07-26 16:31:49
+
+File format: CSV
+
+
+## Basic Statistics
+
+- Emergency subset total records: 11914
+- Emergency+Treatment subset total records: 11023
+- Avg Emergency Text Length: 23847.08
+- Avg Treatment Text Length: 25408.64
+- Avg Emergency Keywords: 2.85
+- Avg Treatment Keywords: 2.97
+
+## Emergency Subset (First 10 Records)
+
+
+### Record 1
+```
+Text preview: # Section 1: Recommendations
+
+# RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
+Matched keywords: shock
+Text length: 37792
+```
+
+
+### Record 2
+```
+Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
+Matched keywords: hemorrhage
+Text length: 7559
+```
+
+
+### Record 3
+```
+Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
+Matched keywords: ards|pulmonary embolism
+Text length: 11731
+```
+
+
+### Record 4
+```
+Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
+Matched keywords: fever|dyspnea|hypotension|sepsis
+Text length: 46087
+```
+
+
+### Record 5
+```
+Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
+Matched keywords: hyperthermia
+Text length: 35302
+```
+
+
+### Record 6
+```
+Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
+Matched keywords: hemorrhage|dyspnea
+Text length: 16186
+```
+
+
+### Record 7
+```
+Text preview: # GUIDELINE OBJECTIVES
+The objective of this guideline is to update a previous guideline on chemothe...
+Matched keywords: hemorrhage
+Text length: 7551
+```
+
+
+### Record 8
+```
+Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
+Matched keywords: mi
+Text length: 50729
+```
+
+
+### Record 9
+```
+Text preview: # GUIDELINE OBJECTIVE
+This guideline was written to provide guidance on the most appropriate follow-...
+Matched keywords: hemorrhage
+Text length: 4299
+```
+
+
+### Record 10
+```
+Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
+Matched keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
+Text length: 54427
+```
+
+
+## Emergency+Treatment Subset (First 10 Records)
+
+
+### Record 1
+```
+Text preview: # Section 1: Recommendations
+
+# RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
+Emergency keywords: shock
+Treatment keywords: management|medication|procedure|fluid|monitoring|iv|administer|dose
+Text length: 37792
+```
+
+
+### Record 2
+```
+Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
+Emergency keywords: hemorrhage
+Treatment keywords: Therapy|treatment|x-ray|us|ct
+Text length: 7559
+```
+
+
+### Record 3
+```
+Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
+Emergency keywords: ards|pulmonary embolism
+Treatment keywords: dopamine|therapy|treatment|surgery|iv|intervention|dose
+Text length: 11731
+```
+
+
+### Record 4
+```
+Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
+Emergency keywords: fever|dyspnea|hypotension|sepsis
+Treatment keywords: treatment|iv|therapy|treat|management|intervention|supportive care|dose
+Text length: 46087
+```
+
+
+### Record 5
+```
+Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
+Emergency keywords: hyperthermia
+Treatment keywords: surgery|treatment|therapy|treat|dose|ct
+Text length: 35302
+```
+
+
+### Record 6
+```
+Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
+Emergency keywords: hemorrhage|dyspnea
+Treatment keywords: therapy|management|treatment|morphine|dose
+Text length: 16186
+```
+
+
+### Record 7
+```
+Text preview: # GUIDELINE OBJECTIVES
+The objective of this guideline is to update a previous guideline on chemothe...
+Emergency keywords: hemorrhage
+Treatment keywords: therapy|treatment|surgery
+Text length: 7551
+```
+
+
+### Record 8
+```
+Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
+Emergency keywords: mi
+Treatment keywords: iv|Dose|therapy|administer|surgery|treatment|treat|medication|ecg
+Text length: 50729
+```
+
+
+### Record 9
+```
+Text preview: # GUIDELINE OBJECTIVE
+This guideline was written to provide guidance on the most appropriate follow-...
+Emergency keywords: hemorrhage
+Treatment keywords: treatment|ct
+Text length: 4299
+```
+
+
+### Record 10
+```
+Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
+Emergency keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
+Treatment keywords: treatment|oxygen|iv|dose|therapy|surgery|x-ray|administer|procedure|management
+Text length: 54427
+```
dataset/analysis/subset_comparison_first10_records_20250726_163158.md ADDED
@@ -0,0 +1,198 @@
+# Optimized Subsets Comparison Report
+
+Generated on: 2025-07-26 16:31:58
+
+File format: JSONL
+
+
+## Basic Statistics
+
+- Emergency subset total records: 11914
+- Emergency+Treatment subset total records: 11023
+- Avg Emergency Text Length: 23847.08
+- Avg Treatment Text Length: 25408.64
+- Avg Emergency Keywords: 2.85
+- Avg Treatment Keywords: 2.97
+
+## Emergency Subset (First 10 Records)
+
+
+### Record 1
+```
+Text preview: # Section 1: Recommendations
+
+# RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
+Matched keywords: shock
+Text length: 37792
+```
+
+
+### Record 2
+```
+Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
+Matched keywords: hemorrhage
+Text length: 7559
+```
+
+
+### Record 3
+```
+Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
+Matched keywords: ards|pulmonary embolism
+Text length: 11731
+```
+
+
+### Record 4
+```
+Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
+Matched keywords: fever|dyspnea|hypotension|sepsis
+Text length: 46087
+```
+
+
+### Record 5
+```
+Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
+Matched keywords: hyperthermia
+Text length: 35302
+```
+
+
+### Record 6
+```
+Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
+Matched keywords: hemorrhage|dyspnea
+Text length: 16186
+```
+
+
+### Record 7
+```
+Text preview: # GUIDELINE OBJECTIVES
+The objective of this guideline is to update a previous guideline on chemothe...
+Matched keywords: hemorrhage
+Text length: 7551
+```
+
+
+### Record 8
+```
+Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
+Matched keywords: mi
+Text length: 50729
+```
+
+
+### Record 9
+```
+Text preview: # GUIDELINE OBJECTIVE
+This guideline was written to provide guidance on the most appropriate follow-...
+Matched keywords: hemorrhage
+Text length: 4299
+```
+
+
+### Record 10
+```
+Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
+Matched keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
+Text length: 54427
+```
+
+
+## Emergency+Treatment Subset (First 10 Records)
+
+
+### Record 1
+```
+Text preview: # Section 1: Recommendations
+
+# RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
+Emergency keywords: shock
+Treatment keywords: management|medication|procedure|fluid|monitoring|iv|administer|dose
+Text length: 37792
+```
+
+
+### Record 2
+```
+Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
+Emergency keywords: hemorrhage
+Treatment keywords: Therapy|treatment|x-ray|us|ct
+Text length: 7559
+```
+
+
+### Record 3
+```
+Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
+Emergency keywords: ards|pulmonary embolism
+Treatment keywords: dopamine|therapy|treatment|surgery|iv|intervention|dose
+Text length: 11731
+```
+
+
+### Record 4
+```
+Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
+Emergency keywords: fever|dyspnea|hypotension|sepsis
+Treatment keywords: treatment|iv|therapy|treat|management|intervention|supportive care|dose
+Text length: 46087
+```
+
+
+### Record 5
+```
+Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
+Emergency keywords: hyperthermia
+Treatment keywords: surgery|treatment|therapy|treat|dose|ct
+Text length: 35302
+```
+
+
+### Record 6
+```
+Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
+Emergency keywords: hemorrhage|dyspnea
+Treatment keywords: therapy|management|treatment|morphine|dose
+Text length: 16186
+```
+
+
+### Record 7
+```
+Text preview: # GUIDELINE OBJECTIVES
+The objective of this guideline is to update a previous guideline on chemothe...
+Emergency keywords: hemorrhage
+Treatment keywords: therapy|treatment|surgery
+Text length: 7551
+```
+
+
+### Record 8
+```
+Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
+Emergency keywords: mi
+Treatment keywords: iv|Dose|therapy|administer|surgery|treatment|treat|medication|ecg
+Text length: 50729
+```
+
+
+### Record 9
+```
+Text preview: # GUIDELINE OBJECTIVE
+This guideline was written to provide guidance on the most appropriate follow-...
+Emergency keywords: hemorrhage
+Treatment keywords: treatment|ct
+Text length: 4299
+```
+
+
+### Record 10
+```
+Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
+Emergency keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
+Treatment keywords: treatment|oxygen|iv|dose|therapy|surgery|x-ray|administer|procedure|management
+Text length: 54427
+```
dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json ADDED
@@ -0,0 +1,293 @@
+{
+  "basic_statistics": {
+    "total_records": 9367,
+    "avg_text_length": 27179.22952919825,
+    "emergency_keywords_count": 47,
+    "treatment_keywords_count": 105
+  },
+  "emergency_keyword_stats": {
+    "Acute abdomen": 51,
+    "Acute bleeding": 31,
+    "Acute Coronary Syndrome": 332,
+    "Acute Kidney Injury": 200,
+    "Acute pancreatitis": 202,
+    "Acute respiratory distress syndrome": 225,
+    "Acute stroke": 65,
+    "Anaphylaxis": 1002,
+    "Anaphylactic Shock": 148,
+    "Arrhythmia": 1490,
+    "Atrial fibrillation": 736,
+    "Atrial flutter": 139,
+    "Bradycardia": 845,
+    "Cardiac arrest": 600,
+    "Cardiogenic Shock": 192,
+    "Chest pain": 1408,
+    "Dyspnea": 1296,
+    "Fever": 4008,
+    "Gastrointestinal Hemorrhage": 158,
+    "GI bleeding": 103,
+    "Hemorrhage": 1532,
+    "Hemorrhagic stroke": 109,
+    "Hyperthermia": 283,
+    "Hypovolemic Shock": 61,
+    "Hypotension": 1897,
+    "Hypothermia": 340,
+    "Internal bleeding": 67,
+    "Intracranial Hemorrhages": 5,
+    "Ischemic stroke": 216,
+    "Loss of consciousness": 406,
+    "Myocardial Infarction": 1607,
+    "MI": 9316,
+    "Pulmonary Edema": 471,
+    "Pulmonary Embolism": 624,
+    "Respiratory distress": 713,
+    "Respiratory failure": 554,
+    "Sepsis": 1145,
+    "Severe Sepsis": 81,
+    "Septic Shock": 231,
+    "Shock": 1702,
+    "Status Epilepticus": 149,
+    "Syncope": 806,
+    "Tachycardia": 1576,
+    "Tachypnea": 262,
+    "Traumatic Brain Injury": 151,
+    "Ventricular Tachycardia": 461,
+    "Ventricular fibrillation": 280
+  },
+  "treatment_keyword_stats": {
+    "ACLS": 30,
+    "administer": 3881,
+    "Adrenaline": 135,
+    "Advanced Cardiac Life Support": 34,
+    "Airway Management": 174,
+    "alpha blocker": 35,
+    "Amiodarone": 315,
+    "analgesia": 323,
+    "Anesthesia Procedural": 0,
+    "Anti-Bacterial Agents": 1,
+    "antibiotic": 1922,
+    "arterial line placement": 0,
+    "beta blocker": 297,
+    "Bi-level Positive Airway Pressure": 6,
+    "bipap": 25,
+    "Blood Transfusion": 379,
+    "Bosmin": 0,
+    "Cardiopulmonary Resuscitation": 131,
+    "Cardioversion": 142,
+    "Catheterization Arterial": 0,
+    "Catheterization Central Venous": 0,
+    "central line placement": 6,
+    "compression dressing": 2,
+    "Computed Tomography": 518,
+    "cpap": 84,
+    "cpr": 151,
+    "crystalloids": 45,
+    "ct scan": 1036,
+    "Defibrillation": 96,
+    "Dopamine": 389,
+    "Dosage Forms": 210,
+    "dose": 5344,
+    "Drug Administration Routes": 0,
+    "Drug Therapy": 773,
+    "Epinephrine": 806,
+    "fluid": 2938,
+    "fluid resuscitation": 115,
+    "hemodynamic monitoring": 43,
+    "Hemodynamics": 135,
+    "Hemostasis": 180,
+    "Ibuprofen": 269,
+    "icu transfer": 9,
+    "Insulin": 808,
+    "intervention": 2695,
+    "intubation": 493,
+    "Intratracheal Intubation": 3,
+    "Intravenous Infusion": 576,
+    "iv fluids": 75,
+    "laboratory techniques": 29,
+    "laboratory testing": 296,
+    "levophed": 11,
+    "Lidocaine": 212,
+    "manage": 4416,
+    "management": 4008,
+    "medication": 4698,
+    "midazolam": 204,
+    "monitor": 4521,
+    "monitoring": 3593,
+    "Morphine": 289,
+    "Nebulization": 41,
+    "nitroglycerin": 125,
+    "NTG": 81,
+    "Norepinephrine": 392,
+    "normal saline": 252,
+    "Ondansetron": 43,
+    "Oxygen": 1779,
+    "Oxygen Inhalation Therapy": 2,
+    "oxygen therapy": 178,
+    "Patient Management": 281,
+    "Patient Monitoring": 107,
+    "POCUS": 10,
+    "point of care ultrasound": 2,
+    "procedural sedation": 26,
+    "procedure": 3073,
+    "radiologic imaging": 5,
+    "Radiography": 218,
+    "resuscitation": 539,
+    "Sedation": 602,
+    "splinting": 26,
+    "Splints": 29,
+    "supportive care": 564,
+    "surgical procedures": 482,
+    "Surgical Procedures Operative": 0,
+    "surgery": 3531,
+    "Suture": 179,
+    "Suturing": 53,
+    "Therapeutic Intervention": 181,
+    "Therapeutics": 182,
+    "Therapy": 6117,
+    "tourniquet": 56,
+    "transfusion": 826,
+    "treat": 8270,
+    "treatment": 7719,
+    "Ultrasonography Point of Care": 0,
+    "ultrasound": 1273,
+    "Vasoconstrictor Agents": 2,
+    "vasopressors": 188,
+    "ventilation support": 14,
+    "Ventilators": 86,
+    "Vital Signs": 459,
+    "vital signs monitoring": 1,
+    "wound care": 73,
+    "Wound Dressing": 30,
+    "Wound Management": 37,
+    "X-Ray": 1293
+  },
+  "cooccurrence_analysis": [
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 3488,
+      "percentage": 37.23710899967973
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 2698,
+      "percentage": 28.803245436105477
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "dose",
+      "cooccurrence_count": 2430,
+      "percentage": 25.94213729048788
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "medication",
+      "cooccurrence_count": 1979,
+      "percentage": 21.127362015586634
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1760,
+      "percentage": 18.789366926443897
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "management",
+      "cooccurrence_count": 1753,
+      "percentage": 18.714636489804633
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "treat",
+      "cooccurrence_count": 1744,
+      "percentage": 18.618554499839863
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "monitoring",
+      "cooccurrence_count": 1674,
+      "percentage": 17.87125013344721
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1558,
+      "percentage": 16.63286004056795
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "surgery",
+      "cooccurrence_count": 1505,
+      "percentage": 16.06704387744208
+    },
+    {
+      "emergency_keyword": "Tachycardia",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1441,
+      "percentage": 15.383794171025942
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "dose",
+      "cooccurrence_count": 1423,
+      "percentage": 15.191630191096403
+    },
+    {
+      "emergency_keyword": "Myocardial Infarction",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1369,
+      "percentage": 14.615138251307783
+    },
+    {
+      "emergency_keyword": "Shock",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1340,
+      "percentage": 14.305540728087967
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "fluid",
+      "cooccurrence_count": 1330,
+      "percentage": 14.198782961460447
+    },
+    {
+      "emergency_keyword": "Hemorrhage",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1328,
+      "percentage": 14.177431408134941
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "monitoring",
+      "cooccurrence_count": 1325,
+      "percentage": 14.145404078146683
+    },
+    {
+      "emergency_keyword": "Tachycardia",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1277,
+      "percentage": 13.632966798334579
+    },
+    {
+      "emergency_keyword": "Dyspnea",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1228,
+      "percentage": 13.10985374185972
+    },
+    {
+      "emergency_keyword": "Myocardial Infarction",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1215,
+      "percentage": 12.97106864524394
+    }
+  ],
+  "path_b_validation": {
+    "avg_emergency_density": 0.3098621434407273,
+    "avg_treatment_density": 0.6108515041451529,
+    "high_density_records": 1298,
+    "precision_estimate": 0.9995729689334899
+  },
+  "condition_mapping_candidates": {}
+}
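Each percentage in cooccurrence_analysis is the co-occurrence count over total_records (e.g. 3488 / 9367 ≈ 37.237 % for Fever × treatment). A sketch of that computation — the split-on-"|" membership test is an assumption about how the matched columns are compared, not the PR's actual analysis script:

```python
import pandas as pd

df = pd.read_json("dataset/emergency/emergency_treatment_subset.jsonl", lines=True)  # assumed path
total = len(df)

def cooccurrence(emergency_kw: str, treatment_kw: str) -> dict:
    """Count records whose two matched columns both contain the given keywords."""
    has_em = df["matched"].fillna("").str.lower().str.split("|").apply(
        lambda terms: emergency_kw.lower() in terms)
    has_tx = df["treatment_matched"].fillna("").str.lower().str.split("|").apply(
        lambda terms: treatment_kw.lower() in terms)
    count = int((has_em & has_tx).sum())
    return {"emergency_keyword": emergency_kw, "treatment_keyword": treatment_kw,
            "cooccurrence_count": count, "percentage": 100.0 * count / total}

print(cooccurrence("Fever", "treatment"))  # 3488 / 9367 = 37.237...% in the report above
```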
dataset/check_source.py ADDED
@@ -0,0 +1,18 @@
+import pandas as pd
+
+# Read the JSONL file that was just downloaded and filtered
+df = pd.read_json("dataset/guidelines_source_filtered.jsonl", lines=True)
+
+# Show how many records come from each source
+print("📊 Record counts per source:")
+print(df["source"].value_counts())
+
+# Verify that only the 9 expected sources are present
+expected_sources = {"cco", "cdc", "cma", "icrc", "nice", "pubmed", "spor", "who", "wikidoc"}
+actual_sources = set(df["source"].unique())
+
+# Show the verification result
+if actual_sources == expected_sources:
+    print("✅ Sources match expectations exactly; no unexpected sources.")
+else:
+    print(f"❌ Unexpected sources found: {actual_sources - expected_sources}")
dataset/filter_guidelines.py ADDED
@@ -0,0 +1,31 @@
+# filter_guidelines.py
+
+from datasets import load_dataset
+import pandas as pd
+import os
+
+# ✅ Trusted source abbreviations (the "source" field in the Hugging Face dataset)
+approved_sources = ["cco", "cdc", "cma", "icrc", "nice", "pubmed", "spor", "who", "wikidoc"]
+
+# Step 1: Load the dataset from Hugging Face
+print("⏳ Loading data...")
+ds = load_dataset("epfl-llm/guidelines", split="train")
+
+# Step 2: Filter by the "source" field
+print("🔍 Filtering trusted sources...")
+ds_filtered = ds.filter(lambda ex: ex["source"] in approved_sources)
+print(f"✅ Filtering complete: {len(ds_filtered)} records in total.")
+
+# Step 3: Convert to a pandas DataFrame
+print("📄 Converting to DataFrame...")
+df = ds_filtered.to_pandas()
+
+# Step 4: Create the dataset folder (if it does not exist)
+os.makedirs("dataset", exist_ok=True)
+
+# Step 5: Save as JSONL and CSV into the dataset/ folder
+print("💾 Saving to the dataset/ folder...")
+df.to_json("dataset/guidelines_source_filtered.jsonl", orient="records", lines=True)
+df.to_csv("dataset/guidelines_source_filtered.csv", index=False)
+
+print("🎉 Done! Data from trusted sources has been saved.")
dataset/keywords/emergency_keywords.txt ADDED
@@ -0,0 +1,47 @@
+Acute abdomen
+Acute bleeding
+Acute Coronary Syndrome
+Acute Kidney Injury
+Acute pancreatitis
+Acute respiratory distress syndrome
+Acute stroke
+Anaphylaxis
+Anaphylactic Shock
+Arrhythmia
+Atrial fibrillation
+Atrial flutter
+Bradycardia
+Cardiac arrest
+Cardiogenic Shock
+Chest pain
+Dyspnea
+Fever
+Gastrointestinal Hemorrhage
+GI bleeding
+Hemorrhage
+Hemorrhagic stroke
+Hyperthermia
+Hypovolemic Shock
+Hypotension
+Hypothermia
+Internal bleeding
+Intracranial Hemorrhages
+Ischemic stroke
+Loss of consciousness
+Myocardial Infarction
+MI
+Pulmonary Edema
+Pulmonary Embolism
+Respiratory distress
+Respiratory failure
+Sepsis
+Severe Sepsis
+Septic Shock
+Shock
+Status Epilepticus
+Syncope
+Tachycardia
+Tachypnea
+Traumatic Brain Injury
+Ventricular Tachycardia
+Ventricular fibrillation
dataset/keywords/special_terms_emergency.json ADDED
@@ -0,0 +1,26 @@
+{
+  "cardiac": {
+    "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
+    "acs": ["acs", "ACS", "acute coronary syndrome"]
+  },
+  "respiratory": {
+    "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
+    "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
+  },
+  "neurological": {
+    "loc": ["loc", "LOC", "loss of consciousness"],
+    "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
+  },
+  "shock": {
+    "shock": ["shock", "circulatory failure"],
+    "septic_shock": ["septic shock", "sepsis induced shock"]
+  },
+  "bleeding": {
+    "gi_bleed": ["gi bleed", "gi bleeding", "gastrointestinal hemorrhage", "GI hemorrhage"],
+    "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
+  },
+  "vital_signs": {
+    "hypotension": ["hypotension", "low bp", "low blood pressure"],
+    "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
+  }
+}
dataset/keywords/special_terms_treatment.json ADDED
@@ -0,0 +1,25 @@
+{
+  "imaging": {
+    "x-ray": ["x-ray", "x ray", "xray", "XR"],
+    "ct": ["ct", "ct-scan", "cat scan", "computed tomography"],
+    "us": ["us", "u/s", "ultrasound", "sonography"]
+  },
+  "medications": {
+    "iv": ["iv", "i.v.", "intravenous"],
+    "im": ["im", "i.m.", "intramuscular"],
+    "po": ["po", "p.o.", "per os", "by mouth"]
+  },
+  "procedures": {
+    "cpr": ["cpr", "CPR", "cardiopulmonary resuscitation"],
+    "intubation": ["intubation", "ETT", "endotracheal tube"],
+    "cardioversion": ["cardioversion", "electrical cardioversion"]
+  },
+  "monitoring": {
+    "ecg": ["ecg", "ekg", "electrocardiogram"],
+    "monitoring": ["monitoring", "continuous observation"]
+  },
+  "ventilation": {
+    "bipap": ["bipap", "BiPAP", "bi-level positive airway pressure"],
+    "cpap": ["cpap", "CPAP", "continuous positive airway pressure"]
+  }
+}
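Note that short variants such as "us" and "ct" are matched as standalone words, so they collide with ordinary English: the comparison reports above list "us" among matched treatment keywords for cancer-care guidelines, which is plausibly the pronoun rather than ultrasound. A quick demonstration:

```python
import re

# Word-boundary matching cannot distinguish the abbreviation "us" from the pronoun.
pattern = re.compile(r"\b(?:us|ct)\b", re.IGNORECASE)
print(pattern.findall("it focuses on us, the patients"))  # ['us'] -- likely false positive
print(pattern.findall("a CT of the chest"))               # ['CT'] -- intended match
```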
dataset/keywords/treatment_keywords.txt ADDED
@@ -0,0 +1,105 @@
+ACLS
+administer
+Adrenaline
+Advanced Cardiac Life Support
+Airway Management
+alpha blocker
+Amiodarone
+analgesia
+Anesthesia Procedural
+Anti-Bacterial Agents
+antibiotic
+arterial line placement
+beta blocker
+Bi-level Positive Airway Pressure
+bipap
+Blood Transfusion
+Bosmin
+Cardiopulmonary Resuscitation
+Cardioversion
+Catheterization Arterial
+Catheterization Central Venous
+central line placement
+compression dressing
+Computed Tomography
+cpap
+cpr
+crystalloids
+ct scan
+Defibrillation
+Dopamine
+Dosage Forms
+dose
+Drug Administration Routes
+Drug Therapy
+Epinephrine
+fluid
+fluid resuscitation
+hemodynamic monitoring
+Hemodynamics
+Hemostasis
+Ibuprofen
+icu transfer
+Insulin
+intervention
+intubation
+Intratracheal Intubation
+Intravenous Infusion
+iv fluids
+laboratory techniques
+laboratory testing
+levophed
+Lidocaine
+manage
+management
+medication
+midazolam
+monitor
+monitoring
+Morphine
+Nebulization
+nitroglycerin
+NTG
+Norepinephrine
+normal saline
+Ondansetron
+Oxygen
+Oxygen Inhalation Therapy
+oxygen therapy
+Patient Management
+Patient Monitoring
+POCUS
+point of care ultrasound
+procedural sedation
+procedure
+radiologic imaging
+Radiography
+resuscitation
+Sedation
+splinting
+Splints
+supportive care
+surgical procedures
+Surgical Procedures Operative
+surgery
+Suture
+Suturing
+Therapeutic Intervention
+Therapeutics
+Therapy
+tourniquet
+transfusion
+treat
+treatment
+Ultrasonography Point of Care
+ultrasound
+Vasoconstrictor Agents
+vasopressors
+ventilation support
+Ventilators
+Vital Signs
+vital signs monitoring
+wound care
+Wound Dressing
+Wound Management
+X-Ray
dataset/scripts/01_filter_emergency.py ADDED
@@ -0,0 +1,58 @@
+# scripts/01_filter_emergency.py
+
+import os
+import re
+import pandas as pd
+
+# Function: Load keywords and print progress
+def load_keywords(path):
+    print(f"📥 Loading keywords from: {path}")
+    with open(path, "r", encoding="utf-8") as f:
+        kws = [line.strip() for line in f if line.strip()]
+    print(f"   Loaded {len(kws)} keywords")
+    return kws
+
+# Step 1: Read source data
+print("1️⃣ Reading source data...")
+source_path = "../dataset/guidelines_source_filtered.jsonl"
+df = pd.read_json(source_path, lines=True)
+print(f"   Loaded {len(df)} records")
+
+# Step 2: Load emergency keywords and match
+print("2️⃣ Loading emergency keywords and matching...")
+keywords = load_keywords("../keywords/emergency_keywords.txt")
+pattern = r"\b(?:" + "|".join(keywords) + r")\b"  # Using non-capturing groups (?:...)
+
+# Match keywords and add metadata columns
+df["matched"] = (
+    df["clean_text"]
+    .fillna("")  # Convert NaN to empty string
+    .str.findall(pattern, flags=re.IGNORECASE)
+    .apply(lambda lst: "|".join(lst) if lst else "")
+)
+df["has_emergency"] = df["matched"].str.len() > 0
+
+# Add metadata columns for future use
+df["type"] = "emergency"  # Document type identifier
+df["condition"] = ""  # Reserved for future condition mapping
+
+# Calculate average matches
+cnt_em = df["has_emergency"].sum()
+avg_matches = (
+    df[df["has_emergency"]]["matched"]
+    .str.count(r"\|")  # Escape the pipe
+    .add(1)
+    .mean()
+)
+
+print(f"   Matched {cnt_em} emergency-related records")
+print(f"   Average keywords per record: {avg_matches:.2f}")
+
+# Step 3: Save emergency subset
+print("3️⃣ Saving emergency subset...")
+out_dir = "../dataset/emergency"
+os.makedirs(out_dir, exist_ok=True)
+subset = df[df["has_emergency"]]
+subset.to_json(f"{out_dir}/emergency_subset.jsonl", orient="records", lines=True)
+subset.to_csv(f"{out_dir}/emergency_subset.csv", index=False)
+print(f"✅ Complete! Generated emergency subset with {len(subset)} records, saved in `{out_dir}`")
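One difference worth flagging: this script interpolates the raw keywords into the regex without escaping, while 01_filter_emergency_opt.py below wraps them in re.escape. That matters once the keyword set includes variants with regex metacharacters, such as the "." in "m.i." or "i.v." from the special-terms JSON files. A quick illustration of the failure mode, with hypothetical inputs:

```python
import re

keywords = ["m.i.", "x-ray"]  # variants like these appear in the special-terms JSON files

# Unescaped: "." matches any character, so unrelated words can match "m.i."
raw = re.compile(r"\b(?:" + "|".join(keywords) + r")\b", re.IGNORECASE)
print(raw.findall("maid with x-ray"))   # ['maid', 'x-ray'] -- 'maid' is a false positive

# Escaped, as in 01_filter_emergency_opt.py: "." is treated literally
safe = re.compile(r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b", re.IGNORECASE)
print(safe.findall("maid with x-ray"))  # ['x-ray']
```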
dataset/scripts/01_filter_emergency_opt.py
ADDED
@@ -0,0 +1,112 @@
import os
import re
import json
import pandas as pd
from pathlib import Path

class MedicalTermProcessor:
    def __init__(self):
        # Load emergency special terms from JSON
        keywords_dir = Path("../keywords")
        with open(keywords_dir / "special_terms_emergency.json", "r") as f:
            self.emergency_terms_by_category = json.load(f)

        # Flatten the nested structure for easy lookup
        self.emergency_special_terms = {}
        for category in self.emergency_terms_by_category.values():
            self.emergency_special_terms.update(category)

    def get_all_variants(self):
        """Get all term variants including special terms"""
        variants = []
        for term_list in self.emergency_special_terms.values():
            variants.extend(term_list)
        return variants

    def standardize_term(self, term: str) -> str:
        """Convert a term to its standard form if it's a variant"""
        term_lower = term.lower()
        for standard_term, variants in self.emergency_special_terms.items():
            if term_lower in [v.lower() for v in variants]:
                return standard_term
        return term

    def process_matches(self, matches: list) -> str:
        """Process matches to standardize terms and remove duplicates"""
        if not matches:
            return ""

        # Standardize terms
        standardized = [self.standardize_term(match) for match in matches]

        # Remove duplicates while preserving order
        seen = set()
        unique_matches = []
        for term in standardized:
            if term.lower() not in seen:
                unique_matches.append(term)
                seen.add(term.lower())

        return "|".join(unique_matches)

# Function: Load keywords and print progress
def load_keywords(path, processor):
    print(f"📥 Loading keywords from: {path}")
    # Load basic keywords
    with open(path, "r", encoding="utf-8") as f:
        basic_kws = [line.strip() for line in f if line.strip()]

    # Add special term variants
    special_kws = processor.get_all_variants()
    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates

    print(f" Loaded {len(all_kws)} keywords (including variants)")
    return all_kws

# Step 1: Read source data
print("1️⃣ Reading source data...")
source_path = "../dataset/guidelines_source_filtered.jsonl"
df = pd.read_json(source_path, lines=True)
print(f" Loaded {len(df)} records")

# Step 2: Load emergency keywords and match
print("2️⃣ Loading emergency keywords and matching...")
processor = MedicalTermProcessor()
keywords = load_keywords("../keywords/emergency_keywords.txt", processor)
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"

# Match keywords and add metadata columns
df["matched"] = (
    df["clean_text"]
    .fillna("")  # Convert NaN to empty string
    .str.findall(pattern, flags=re.IGNORECASE)
    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
)
df["has_emergency"] = df["matched"].str.len() > 0

# Add metadata columns for future use
df["type"] = "emergency"  # Document type identifier
df["condition"] = ""  # Reserved for future condition mapping

# Calculate average matches
cnt_em = df["has_emergency"].sum()
avg_matches = (
    df[df["has_emergency"]]["matched"]
    .str.count(r"\|")  # Escape the pipe
    .add(1)
    .mean()
)

print(f" Matched {cnt_em} emergency-related records")
print(f" Average keywords per record: {avg_matches:.2f}")

# Step 3: Save emergency subset
print("3️⃣ Saving emergency subset...")
out_dir = "../dataset/emergency"
os.makedirs(out_dir, exist_ok=True)
subset = df[df["has_emergency"]]

# Save with _opt suffix to distinguish from original files
subset.to_json(f"{out_dir}/emergency_subset_opt.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_subset_opt.csv", index=False)
print(f"✅ Complete! Generated emergency subset with {len(subset)} records, saved in `{out_dir}` with _opt suffix")
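A standalone sketch of the standardize-and-deduplicate behaviour of process_matches, with a made-up variants table (the committed special_terms_emergency.json is not shown in this diff):

special_terms = {"myocardial infarction": ["MI", "myocardial infarction", "heart attack"]}  # hypothetical

def standardize(term):
    for standard, variants in special_terms.items():
        if term.lower() in [v.lower() for v in variants]:
            return standard
    return term

matches = ["MI", "heart attack", "mi"]  # hypothetical regex hits
seen, unique = set(), []
for t in (standardize(m) for m in matches):
    if t.lower() not in seen:
        unique.append(t)
        seen.add(t.lower())
print("|".join(unique))  # -> "myocardial infarction": variants standardized, duplicates dropped, order preserved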
dataset/scripts/02_filter_treatment.py
ADDED
@@ -0,0 +1,103 @@
# scripts/02_filter_treatment.py

import os
import re
import pandas as pd

def preprocess_keywords(keywords_file):
    """Load and preprocess treatment keywords"""
    print(f"📥 Loading keywords from: {keywords_file}")

    # Special medical terms with common variants
    special_terms = {
        'x-ray': ['x-ray', 'x ray', 'xray'],
        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
        'point-of-care': ['point-of-care', 'point of care']
    }

    # Read and preprocess keywords
    with open(keywords_file, "r", encoding="utf-8") as f:
        keywords = [line.strip().lower() for line in f if line.strip()]

    # Process keywords and handle special terms
    processed_keywords = []
    for kw in keywords:
        if kw in special_terms:
            processed_keywords.extend(special_terms[kw])
        else:
            processed_keywords.append(kw)

    print(f" Loaded {len(keywords)} base keywords")
    print(f" Processed into {len(processed_keywords)} keyword variants")
    return processed_keywords

def create_regex_pattern(keywords):
    """Create compiled regex pattern with word boundaries"""
    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
    return re.compile(pattern, re.IGNORECASE)

# Step 1: Read source data
print("1️⃣ Reading emergency subset...")
emergency_path = "../dataset/emergency/emergency_subset.jsonl"
df = pd.read_json(emergency_path, lines=True)
print(f" Loaded {len(df)} emergency records")
print(f" Contains emergency keywords in 'matched' column")

# Step 2: Load treatment keywords and match
print("2️⃣ Loading treatment keywords and matching...")
treatment_keywords = preprocess_keywords("../keywords/treatment_keywords.txt")
pattern = create_regex_pattern(treatment_keywords)

# Step 3: Process text and match keywords
print("3️⃣ Processing text and matching keywords...")
# Create lowercase version of text for matching
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

# Match treatment keywords and add metadata columns
# Note: Preserving original 'matched' column from emergency subset
df["treatment_matched"] = (
    df["clean_text_lower"]
    .apply(lambda text: "|".join(pattern.findall(text)) or "")
)
df["has_treatment"] = df["treatment_matched"].str.len() > 0

# Add metadata columns for future use
df["type"] = "treatment"  # Document type identifier
df["condition"] = ""  # Reserved for future condition mapping

# Verify columns
print(" Verifying columns...")
print(f" - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
print(f" - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")

# Calculate statistics
cnt_treat = df["has_treatment"].sum()
avg_matches = (
    df[df["has_treatment"]]["treatment_matched"]
    .str.count(r"\|")
    .add(1)
    .mean()
)

print(f" Found {cnt_treat} treatment-related records")
print(f" Average treatment keywords per record: {avg_matches:.2f}")

# Step 4: Save treatment subset
print("4️⃣ Saving treatment subset...")
out_dir = "../dataset/emergency_treatment"
os.makedirs(out_dir, exist_ok=True)

# Select records with treatment keywords
subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning

# Verify final subset columns
print(" Final subset columns:")
print(f" - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
print(f" - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")

subset.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)

print(f"✅ Generated treatment subset with {len(subset)} records")
print(f" Saved in: {out_dir}")
print(f" Contains both emergency and treatment keywords")
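For illustration, the variant expansion in preprocess_keywords behaves as in this minimal sketch, assuming a hypothetical two-entry keyword file:

special_terms = {'x-ray': ['x-ray', 'x ray', 'xray']}
keywords = ['x-ray', 'defibrillation']  # hypothetical file contents

processed = []
for kw in keywords:
    processed.extend(special_terms.get(kw, [kw]))
print(processed)  # ['x-ray', 'x ray', 'xray', 'defibrillation']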
dataset/scripts/02_filter_treatment_opt.py
ADDED
@@ -0,0 +1,131 @@
import os
import re
import json
import pandas as pd
from pathlib import Path

class MedicalTermProcessor:
    def __init__(self):
        # Load treatment special terms from JSON
        keywords_dir = Path("../keywords")
        with open(keywords_dir / "special_terms_treatment.json", "r") as f:
            self.treatment_terms_by_category = json.load(f)

        # Flatten the nested structure for easy lookup
        self.treatment_special_terms = {}
        for category in self.treatment_terms_by_category.values():
            self.treatment_special_terms.update(category)

    def get_all_variants(self):
        """Get all term variants including special terms"""
        variants = []
        for term_list in self.treatment_special_terms.values():
            variants.extend(term_list)
        return variants

    def standardize_term(self, term: str) -> str:
        """Convert a term to its standard form if it's a variant"""
        term_lower = term.lower()
        for standard_term, variants in self.treatment_special_terms.items():
            if term_lower in [v.lower() for v in variants]:
                return standard_term
        return term

    def process_matches(self, matches: list) -> str:
        """Process matches to standardize terms and remove duplicates"""
        if not matches:
            return ""

        # Standardize terms
        standardized = [self.standardize_term(match) for match in matches]

        # Remove duplicates while preserving order
        seen = set()
        unique_matches = []
        for term in standardized:
            if term.lower() not in seen:
                unique_matches.append(term)
                seen.add(term.lower())

        return "|".join(unique_matches)

def load_keywords(path, processor):
    """Load and preprocess treatment keywords"""
    print(f"📥 Loading keywords from: {path}")

    # Load basic keywords
    with open(path, "r", encoding="utf-8") as f:
        basic_kws = [line.strip() for line in f if line.strip()]

    # Add special term variants
    special_kws = processor.get_all_variants()
    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates

    print(f" Loaded {len(all_kws)} keywords (including variants)")
    return all_kws

# Step 1: Read optimized emergency subset
print("1️⃣ Reading optimized emergency subset...")
emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
df = pd.read_json(emergency_path, lines=True)
print(f" Loaded {len(df)} emergency records")
print(f" Contains emergency keywords in 'matched' column")

# Step 2: Load treatment keywords and match
print("2️⃣ Loading treatment keywords and matching...")
processor = MedicalTermProcessor()
keywords = load_keywords("../keywords/treatment_keywords.txt", processor)
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"

# Step 3: Process text and match keywords
print("3️⃣ Processing text and matching keywords...")
# Match treatment keywords and add metadata columns
df["treatment_matched"] = (
    df["clean_text"]
    .fillna("")  # Convert NaN to empty string
    .str.findall(pattern, flags=re.IGNORECASE)
    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
)
df["has_treatment"] = df["treatment_matched"].str.len() > 0

# Add metadata columns for future use
df["type"] = "treatment"  # Document type identifier
df["condition"] = ""  # Reserved for future condition mapping

# Verify columns
print(" Verifying columns...")
print(f" - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
print(f" - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")

# Calculate statistics
cnt_treat = df["has_treatment"].sum()
avg_matches = (
    df[df["has_treatment"]]["treatment_matched"]
    .str.count(r"\|")
    .add(1)
    .mean()
)

print(f" Found {cnt_treat} treatment-related records")
print(f" Average treatment keywords per record: {avg_matches:.2f}")

# Step 4: Save treatment subset
print("4️⃣ Saving treatment subset...")
out_dir = "../dataset/emergency_treatment"
os.makedirs(out_dir, exist_ok=True)

# Select records with treatment keywords
subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning

# Verify final subset columns
print(" Final subset columns:")
print(f" - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
print(f" - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")

# Save with _opt suffix
subset.to_json(f"{out_dir}/emergency_treatment_subset_opt.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_treatment_subset_opt.csv", index=False)

print(f"✅ Generated optimized treatment subset with {len(subset)} records")
print(f" Saved in: {out_dir}")
print(f" Contains both emergency and treatment keywords")
dataset/scripts/check_subset_integrity.py
ADDED
@@ -0,0 +1,178 @@
#!/usr/bin/env python3
# /scripts/check_subset_integrity.py

import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm

def check_subset_sample(file_path, sample_size=100):
    """
    Check the first N rows of the subset file
    """
    print(f"\n{'='*60}")
    print(f"📊 Sampling Analysis (first {sample_size} rows)")
    print(f"{'='*60}")

    # Read sample
    print(f"\n1️⃣ Reading sample from: {file_path}")
    sample_df = pd.read_csv(file_path, nrows=sample_size)

    # Basic information
    print("\n2️⃣ Basic Information:")
    print(f" Columns present: {', '.join(sample_df.columns.tolist())}")

    # Check matched columns
    print("\n3️⃣ Matched Columns Status:")
    matched_stats = {
        'matched': {
            'non_null': int(sample_df['matched'].notna().sum()),
            'non_empty': int((sample_df['matched'].str.len() > 0).sum()),
            'unique_values': sample_df['matched'].nunique()
        },
        'treatment_matched': {
            'non_null': int(sample_df['treatment_matched'].notna().sum()),
            'non_empty': int((sample_df['treatment_matched'].str.len() > 0).sum()),
            'unique_values': sample_df['treatment_matched'].nunique()
        }
    }

    for col, stats in matched_stats.items():
        print(f"\n {col}:")
        print(f" - Non-null count: {stats['non_null']}/{sample_size}")
        print(f" - Non-empty count: {stats['non_empty']}/{sample_size}")
        print(f" - Unique values: {stats['unique_values']}")

    # Sample rows with both matches
    print("\n4️⃣ Sample Rows with Both Matches:")
    both_matched = sample_df[
        (sample_df['matched'].notna() & sample_df['matched'].str.len() > 0) &
        (sample_df['treatment_matched'].notna() & sample_df['treatment_matched'].str.len() > 0)
    ].head(3)

    for idx, row in both_matched.iterrows():
        print(f"\n Row {idx}:")
        print(f" - Emergency keywords: {row['matched']}")
        print(f" - Treatment keywords: {row['treatment_matched']}")

    return matched_stats

def analyze_large_file(file_path, chunk_size=1000):
    """
    Analyze the entire file in chunks
    """
    print(f"\n{'='*60}")
    print(f"📈 Full File Analysis (chunk size: {chunk_size})")
    print(f"{'='*60}")

    stats = {
        'total_rows': 0,
        'matched_stats': {
            'non_null': 0,
            'non_empty': 0
        },
        'treatment_matched_stats': {
            'non_null': 0,
            'non_empty': 0
        },
        'both_matched': 0
    }

    print("\n1️⃣ Processing file in chunks...")
    chunks = pd.read_csv(file_path, chunksize=chunk_size)

    for chunk in tqdm(chunks, desc="Analyzing chunks"):
        # Update total rows
        stats['total_rows'] += len(chunk)

        # Update matched stats
        stats['matched_stats']['non_null'] += chunk['matched'].notna().sum()
        stats['matched_stats']['non_empty'] += (chunk['matched'].str.len() > 0).sum()

        # Update treatment_matched stats
        stats['treatment_matched_stats']['non_null'] += chunk['treatment_matched'].notna().sum()
        stats['treatment_matched_stats']['non_empty'] += (chunk['treatment_matched'].str.len() > 0).sum()

        # Update both matched count
        stats['both_matched'] += (
            (chunk['matched'].notna() & chunk['matched'].str.len() > 0) &
            (chunk['treatment_matched'].notna() & chunk['treatment_matched'].str.len() > 0)
        ).sum()

    return stats

def generate_report(sample_stats, full_stats, output_dir):
    """
    Generate and save analysis report
    """
    print(f"\n{'='*60}")
    print(f"📝 Generating Report")
    print(f"{'='*60}")

    report = {
        'sample_analysis': sample_stats,
        'full_file_analysis': {
            'total_records': int(full_stats['total_rows']),
            'matched_column': {
                'non_null_count': int(full_stats['matched_stats']['non_null']),
                'non_empty_count': int(full_stats['matched_stats']['non_empty']),
                'null_percentage': float(
                    (full_stats['total_rows'] - full_stats['matched_stats']['non_null'])
                    / full_stats['total_rows'] * 100
                )
            },
            'treatment_matched_column': {
                'non_null_count': int(full_stats['treatment_matched_stats']['non_null']),
                'non_empty_count': int(full_stats['treatment_matched_stats']['non_empty']),
                'null_percentage': float(
                    (full_stats['total_rows'] - full_stats['treatment_matched_stats']['non_null'])
                    / full_stats['total_rows'] * 100
                )
            },
            'both_matched_count': int(full_stats['both_matched']),
            'both_matched_percentage': float(
                full_stats['both_matched'] / full_stats['total_rows'] * 100
            )
        }
    }

    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save report
    report_file = output_dir / 'integrity_check_report.json'
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {report_file}")

    # Print summary
    print("\n📊 Summary:")
    print(f"Total records: {report['full_file_analysis']['total_records']}")
    print(f"Records with both matches: {report['full_file_analysis']['both_matched_count']} "
          f"({report['full_file_analysis']['both_matched_percentage']:.2f}%)")

    return report

def main():
    """
    Main execution function
    """
    # Configuration
    input_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
    output_dir = "../analysis/integrity_check"

    print(f"\n🔍 Starting Subset Integrity Check")
    print(f"Input file: {input_file}")
    print(f"Output directory: {output_dir}")

    # Run analysis
    sample_stats = check_subset_sample(input_file)
    full_stats = analyze_large_file(input_file)
    report = generate_report(sample_stats, full_stats, output_dir)

    print("\n✅ Integrity check complete!")

if __name__ == "__main__":
    main()
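One caveat when reading the boolean masks above: in Python, `&` binds more tightly than `>`, so `s.notna() & s.str.len() > 0` parses as `(s.notna() & s.str.len()) > 0`. A small sketch of the explicitly parenthesized form such checks presumably intend:

import pandas as pd

s = pd.Series(["a|b", "", None])
mask = s.notna() & (s.str.len() > 0)  # explicit parentheses avoid the precedence trap
print(mask.tolist())  # [True, False, False]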
dataset/scripts/commit_message_20250726_special_terms.txt
ADDED
@@ -0,0 +1,39 @@
refactor: migrate special terms to JSON configuration

BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files

1. Create New Configuration Files:
   - Add special_terms_emergency.json
     - Organize emergency terms by categories (cardiac, respiratory, etc.)
     - Include all existing mappings with standardized structure
   - Add special_terms_treatment.json
     - Organize treatment terms by categories (imaging, medications, etc.)
     - Maintain all existing term variants

2. Update Processing Scripts:
   - Modify 01_filter_emergency_opt.py:
     - Load terms from JSON configuration
     - Add term standardization
     - Implement deduplication
     - Preserve category information
   - Modify 02_filter_treatment_opt.py:
     - Similar updates for treatment terms
     - Maintain consistent processing logic

3. New Features:
   - Term standardization: Convert variants to standard form
   - Deduplication: Remove repeated terms while preserving order
   - Category-aware: Support for term categorization
   - Improved maintainability: Configuration separated from code

4. Technical Details:
   - Use pathlib for file path handling
   - JSON structure supports hierarchical organization
   - Maintain backward compatibility
   - Add type hints for better code clarity

Testing:
- Verify JSON format
- Confirm all mappings migrated correctly
- Check term standardization
- Validate deduplication logic
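For reference, the loaders in 01_filter_emergency_opt.py and 02_filter_treatment_opt.py expect a two-level structure of the form {category: {standard term: [variant list]}}. The sketch below is illustrative only; the category names come from this commit message, but the terms are hypothetical and not the contents of the committed JSON files:

{
  "cardiac": {
    "myocardial infarction": ["myocardial infarction", "MI", "heart attack"]
  },
  "respiratory": {
    "respiratory distress": ["respiratory distress", "shortness of breath"]
  }
}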
dataset/scripts/compare_subsets_opt.py
ADDED
@@ -0,0 +1,124 @@
# /scripts/compare_subsets_opt.py
import pandas as pd
from pathlib import Path
from datetime import datetime

def load_and_compare_subsets(format_type='csv'):
    """
    Load and compare the first 10 records from both optimized subsets

    Args:
        format_type (str): 'csv' or 'jsonl'
    """
    # Prepare output file
    output_dir = Path("../analysis")
    output_dir.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_dir / f"subset_comparison_first10_records_{timestamp}.md"

    # Initialize markdown content
    md_content = []
    md_content.append("# Optimized Subsets Comparison Report\n")
    md_content.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    md_content.append(f"File format: {format_type.upper()}\n")

    # Set file paths based on format
    if format_type == 'csv':
        emergency_path = "../dataset/emergency/emergency_subset_opt.csv"
        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
        # Load CSV files
        emergency_df = pd.read_csv(emergency_path)
        treatment_df = pd.read_csv(treatment_path)
    else:  # jsonl
        emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.jsonl"
        # Load JSONL files
        emergency_df = pd.read_json(emergency_path, lines=True)
        treatment_df = pd.read_json(treatment_path, lines=True)

    # Print and save basic statistics
    print("\n📊 Basic Statistics:")
    print("-" * 40)
    md_content.append("\n## Basic Statistics\n")

    stats = [
        f"- Emergency subset total records: {len(emergency_df)}",
        f"- Emergency+Treatment subset total records: {len(treatment_df)}",
        f"- Avg Emergency Text Length: {emergency_df['clean_text'].str.len().mean():.2f}",
        f"- Avg Treatment Text Length: {treatment_df['clean_text'].str.len().mean():.2f}"
    ]

    # Calculate average keywords using pattern
    pattern = r'\|'
    emergency_avg = emergency_df['matched'].str.count(pattern).add(1).mean()
    treatment_avg = treatment_df['matched'].str.count(pattern).add(1).mean()

    stats.extend([
        f"- Avg Emergency Keywords: {emergency_avg:.2f}",
        f"- Avg Treatment Keywords: {treatment_avg:.2f}"
    ])

    # Print to console and add to markdown
    for stat in stats:
        print(stat.replace("- ", ""))
    md_content.extend(stats)

    # Compare first 10 records from Emergency subset
    print("\n🔍 First 10 records from Emergency Subset:")
    print("-" * 80)
    md_content.append("\n## Emergency Subset (First 10 Records)\n")

    for idx, row in emergency_df.head(10).iterrows():
        print(f"\nRecord #{idx+1}")
        print(f"Text preview: {row['clean_text'][:100]}...")
        print(f"Matched keywords: {row['matched']}")
        print(f"Text length: {len(row['clean_text'])}")
        print("-" * 40)

        md_content.extend([
            f"\n### Record {idx+1}",
            "```",
            f"Text preview: {row['clean_text'][:100]}...",
            f"Matched keywords: {row['matched']}",
            f"Text length: {len(row['clean_text'])}",
            "```\n"
        ])

    # Compare first 10 records from Emergency+Treatment subset
    print("\n🔍 First 10 records from Emergency+Treatment Subset:")
    print("-" * 80)
    md_content.append("\n## Emergency+Treatment Subset (First 10 Records)\n")

    for idx, row in treatment_df.head(10).iterrows():
        print(f"\nRecord #{idx+1}")
        print(f"Text preview: {row['clean_text'][:100]}...")
        print(f"Emergency keywords: {row['matched']}")
        print(f"Treatment keywords: {row['treatment_matched']}")
        print(f"Text length: {len(row['clean_text'])}")
        print("-" * 40)

        md_content.extend([
            f"\n### Record {idx+1}",
            "```",
            f"Text preview: {row['clean_text'][:100]}...",
            f"Emergency keywords: {row['matched']}",
            f"Treatment keywords: {row['treatment_matched']}",
            f"Text length: {len(row['clean_text'])}",
            "```\n"
        ])

    # Save markdown content
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(md_content))

    print(f"\n✅ Comparison complete!")
    print(f"Report saved to: {output_file}")

if __name__ == "__main__":
    # Compare using CSV format
    print("\nComparing CSV files...")
    load_and_compare_subsets('csv')

    # Compare using JSONL format
    print("\nComparing JSONL files...")
    load_and_compare_subsets('jsonl')
dataset/scripts/data_explorer.py
ADDED
@@ -0,0 +1,123 @@
# /scripts/data_explorer.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json

def analyze_subset(file_path, keywords_path, output_dir="analysis"):
    """Analyze subset data quality and distribution"""
    print(f"\n{'='*50}")
    print(f"Starting dataset analysis: {file_path}")
    print(f"Using keywords file: {keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*50}\n")

    # Load data
    print("1️⃣ Loading data...")
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)

    # 1. Basic statistics
    print("\n2️⃣ Calculating basic statistics...")
    total = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_len = df['text_length'].mean()
    print(f"Total records: {total}")
    print(f"Average text length: {avg_len:.2f}")

    # Initialize statistics dictionary with native Python types
    stats = {
        'basic_statistics': {
            'total_records': int(total),
            'avg_length': float(avg_len)
        },
        'keyword_statistics': {}
    }

    # 2. Keyword analysis
    print("\n3️⃣ Performing keyword analysis...")
    with open(keywords_path, 'r') as f:
        keywords = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(keywords)} keywords")

    # Count keywords and store in stats
    for keyword in keywords:
        cnt = df['clean_text'].str.contains(keyword, case=False).sum()
        stats['keyword_statistics'][keyword] = int(cnt)
        print(f" - {keyword}: {cnt} records")

    # 3. Visualization
    print("\n4️⃣ Generating visualizations...")
    output_path = Path(output_dir) / "plots"
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Charts will be saved in: {output_path}")

    # 3.1 Keyword distribution chart
    print(" - Generating keyword distribution chart...")
    plt.figure(figsize=(15, 8))
    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
    plt.xticks(rotation=45, ha='right')
    # TODO: change the title to the name of the subset
    plt.title('Keyword Distribution for Emergency Subset')
    plt.xlabel('Keywords')
    plt.ylabel('Match Count')
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_distribution_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 3.2 Text length distribution
    print(" - Generating text length distribution...")
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title('Text Length Distribution')
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "text_length_dist_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 3.3 Keyword co-occurrence analysis
    print(" - Generating keyword co-occurrence heatmap...")
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    for text in df['clean_text']:
        present_keywords = [k for k in keywords if k.lower() in text.lower()]
        for i, k1 in enumerate(present_keywords):
            for j, k2 in enumerate(present_keywords):
                if i != j:
                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1

    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_cooccurrence_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 4. Save statistics
    print("\n5️⃣ Saving statistics...")
    stats_path = Path(output_dir) / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)
    # TODO: change the name of the file to the name of the subset
    stats_file = stats_path / "analysis_stats_emergency_subset.json"

    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to: {stats_file}")

    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")

if __name__ == "__main__":
    # Set file paths
    emergency_subset = "../dataset/emergency/emergency_subset.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    output_dir = "../analysis"

    # Run analysis
    analyze_subset(emergency_subset, emergency_keywords, output_dir)
dataset/scripts/data_explorer_opt.py
ADDED
@@ -0,0 +1,118 @@
# /scripts/data_explorer_opt.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json

def analyze_subset(file_path, keywords_path, output_dir="analysis", subset_name="emergency"):
    """Analyze subset data quality and distribution"""
    print(f"\n{'='*50}")
    print(f"Starting optimized dataset analysis: {file_path}")
    print(f"Using keywords file: {keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*50}\n")

    # Load data
    print("1️⃣ Loading data...")
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)

    # 1. Basic statistics
    print("\n2️⃣ Calculating basic statistics...")
    total = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_len = df['text_length'].mean()
    print(f"Total records: {total}")
    print(f"Average text length: {avg_len:.2f}")

    # Initialize statistics dictionary with native Python types
    stats = {
        'basic_statistics': {
            'total_records': int(total),
            'avg_length': float(avg_len)
        },
        'keyword_statistics': {}
    }

    # 2. Keyword analysis
    print("\n3️⃣ Performing keyword analysis...")
    with open(keywords_path, 'r') as f:
        keywords = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(keywords)} keywords")

    # Count keywords and store in stats
    for keyword in keywords:
        cnt = df['clean_text'].str.contains(keyword, case=False).sum()
        stats['keyword_statistics'][keyword] = int(cnt)
        print(f" - {keyword}: {cnt} records")

    # 3. Visualization
    print("\n4️⃣ Generating visualizations...")
    output_path = Path(output_dir) / "plots"
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Charts will be saved in: {output_path}")

    # 3.1 Keyword distribution chart
    print(" - Generating keyword distribution chart...")
    plt.figure(figsize=(15, 8))
    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
    plt.xticks(rotation=45, ha='right')
    plt.title(f'Keyword Distribution for {subset_name.capitalize()} Subset (Optimized)')
    plt.xlabel('Keywords')
    plt.ylabel('Match Count')
    plt.savefig(output_path / f"keyword_distribution_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 3.2 Text length distribution
    print(" - Generating text length distribution...")
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title(f'Text Length Distribution ({subset_name.capitalize()} Subset - Optimized)')
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    plt.savefig(output_path / f"text_length_dist_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 3.3 Keyword co-occurrence analysis
    print(" - Generating keyword co-occurrence heatmap...")
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    for text in df['clean_text']:
        present_keywords = [k for k in keywords if k.lower() in text.lower()]
        for i, k1 in enumerate(present_keywords):
            for j, k2 in enumerate(present_keywords):
                if i != j:
                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1

    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title(f'Keyword Co-occurrence Heatmap ({subset_name.capitalize()} Subset - Optimized)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(output_path / f"keyword_cooccurrence_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 4. Save statistics
    print("\n5️⃣ Saving statistics...")
    stats_path = Path(output_dir) / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)
    stats_file = stats_path / f"analysis_stats_{subset_name}_subset_opt.json"

    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to: {stats_file}")

    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")

if __name__ == "__main__":
    # Set file paths for optimized version
    emergency_subset = "../dataset/emergency/emergency_subset_opt.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    output_dir = "../analysis"

    # Run analysis
    analyze_subset(emergency_subset, emergency_keywords, output_dir, "emergency")
dataset/scripts/data_explorer_treatment.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /scripts/data_explorer_treatment.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
import numpy as np
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import json
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
import re
|
| 10 |
+
|
| 11 |
+
def calculate_density(matches, text_length):
|
| 12 |
+
"""
|
| 13 |
+
Calculate keyword density per 1000 words
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
matches: Number of keyword matches
|
| 17 |
+
text_length: Total text length
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
float: Density per 1000 words
|
| 21 |
+
"""
|
| 22 |
+
return (matches / text_length) * 1000
|
| 23 |
+
|
| 24 |
+
def analyze_treatment_subset(
|
| 25 |
+
treatment_file_path,
|
| 26 |
+
emergency_keywords_path,
|
| 27 |
+
treatment_keywords_path,
|
| 28 |
+
output_dir="analysis_treatment"
|
| 29 |
+
):
|
| 30 |
+
"""
|
| 31 |
+
Specialized analysis for treatment subset focusing on:
|
| 32 |
+
1. Dual keyword analysis (emergency + treatment)
|
| 33 |
+
2. Path B effectiveness validation
|
| 34 |
+
3. Condition mapping data preparation
|
| 35 |
+
4. RAG readiness assessment
|
| 36 |
+
"""
|
| 37 |
+
print(f"\n{'='*60}")
|
| 38 |
+
print(f"Treatment Subset Analysis")
|
| 39 |
+
print(f"Treatment file: {treatment_file_path}")
|
| 40 |
+
print(f"Emergency keywords: {emergency_keywords_path}")
|
| 41 |
+
print(f"Treatment keywords: {treatment_keywords_path}")
|
| 42 |
+
print(f"Output directory: {output_dir}")
|
| 43 |
+
print(f"{'='*60}\n")
|
| 44 |
+
|
| 45 |
+
# Load data
|
| 46 |
+
print("1️⃣ Loading treatment subset data...")
|
| 47 |
+
df = pd.read_csv(treatment_file_path)
|
| 48 |
+
output_dir = Path(output_dir)
|
| 49 |
+
|
| 50 |
+
# Load keyword lists
|
| 51 |
+
print("2️⃣ Loading keyword lists...")
|
| 52 |
+
with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
|
| 53 |
+
emergency_keywords = [line.strip() for line in f if line.strip()]
|
| 54 |
+
|
| 55 |
+
with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
|
| 56 |
+
treatment_keywords = [line.strip() for line in f if line.strip()]
|
| 57 |
+
|
| 58 |
+
print(f" Emergency keywords: {len(emergency_keywords)}")
|
| 59 |
+
print(f" Treatment keywords: {len(treatment_keywords)}")
|
| 60 |
+
|
| 61 |
+
# Basic statistics
|
| 62 |
+
print("\n3️⃣ Computing basic statistics...")
|
| 63 |
+
total_records = len(df)
|
| 64 |
+
df['text_length'] = df['clean_text'].str.len()
|
| 65 |
+
avg_length = df['text_length'].mean()
|
| 66 |
+
|
| 67 |
+
print(f" Total treatment records: {total_records}")
|
| 68 |
+
print(f" Average text length: {avg_length:.2f} characters")
|
| 69 |
+
|
| 70 |
+
# Initialize comprehensive statistics
|
| 71 |
+
stats = {
|
| 72 |
+
'basic_statistics': {
|
| 73 |
+
'total_records': int(total_records),
|
| 74 |
+
'avg_text_length': float(avg_length),
|
| 75 |
+
'emergency_keywords_count': len(emergency_keywords),
|
| 76 |
+
'treatment_keywords_count': len(treatment_keywords)
|
| 77 |
+
},
|
| 78 |
+
'emergency_keyword_stats': {},
|
| 79 |
+
'treatment_keyword_stats': {},
|
| 80 |
+
'cooccurrence_analysis': {},
|
| 81 |
+
'path_b_validation': {},
|
| 82 |
+
'condition_mapping_candidates': {}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
# Emergency keyword analysis in treatment subset
|
| 86 |
+
print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
|
| 87 |
+
for keyword in emergency_keywords:
|
| 88 |
+
count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
|
| 89 |
+
stats['emergency_keyword_stats'][keyword] = int(count)
|
| 90 |
+
print(f" Emergency: {keyword} -> {count} records")
|
| 91 |
+
|
| 92 |
+
# Treatment keyword analysis
|
| 93 |
+
print("\n5️⃣ Analyzing treatment keywords...")
|
| 94 |
+
for keyword in treatment_keywords:
|
| 95 |
+
count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
|
| 96 |
+
stats['treatment_keyword_stats'][keyword] = int(count)
|
| 97 |
+
print(f" Treatment: {keyword} -> {count} records")
|
| 98 |
+
|
| 99 |
+
# Step 6: Co-occurrence analysis
|
| 100 |
+
print("\n6️⃣ Computing keyword co-occurrence patterns...")
|
| 101 |
+
|
| 102 |
+
# Initialize matrices for full dataset
|
| 103 |
+
emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
|
| 104 |
+
treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
|
| 105 |
+
|
| 106 |
+
# Pre-process text
|
| 107 |
+
print(" Pre-processing text...")
|
| 108 |
+
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
|
| 109 |
+
|
| 110 |
+
# Process all emergency keywords
|
| 111 |
+
print("\n Processing all emergency keywords...")
|
| 112 |
+
for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
|
| 113 |
+
# Using word boundary instead of negative lookbehind/lookahead
|
| 114 |
+
pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
|
| 115 |
+
emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
| 116 |
+
matches = emergency_matrix[:, i].sum()
|
| 117 |
+
print(f" - {keyword}: {matches} matches")
|
| 118 |
+
|
| 119 |
+
# Process all treatment keywords
|
| 120 |
+
print("\n Processing all treatment keywords...")
|
| 121 |
+
for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
|
| 122 |
+
# Using word boundary instead of negative lookbehind/lookahead
|
| 123 |
+
pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
|
| 124 |
+
treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
| 125 |
+
matches = treatment_matrix[:, i].sum()
|
| 126 |
+
print(f" - {keyword}: {matches} matches")
|
| 127 |
+
|
| 128 |
+
# Compute co-occurrence matrix
|
| 129 |
+
print("\n Computing co-occurrence matrix...")
|
| 130 |
+
cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
|
| 131 |
+
print(" Computation completed successfully")
|
| 132 |
+
|
| 133 |
+
# Extract results
|
| 134 |
+
print(" Extracting co-occurrence pairs...")
|
| 135 |
+
cooccurrence_pairs = []
|
| 136 |
+
for i, em_kw in enumerate(emergency_keywords):
|
| 137 |
+
for j, tr_kw in enumerate(treatment_keywords):
|
| 138 |
+
count = int(cooc_matrix[i, j])
|
| 139 |
+
if count > 0:
|
| 140 |
+
cooccurrence_pairs.append({
|
| 141 |
+
'emergency_keyword': em_kw,
|
| 142 |
+
'treatment_keyword': tr_kw,
|
| 143 |
+
'cooccurrence_count': count,
|
| 144 |
+
'percentage': float(count / len(df) * 100)
|
| 145 |
+
})
|
| 146 |
+
|
| 147 |
+
# Sort and store results
|
| 148 |
+
cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
|
| 149 |
+
stats['cooccurrence_analysis'] = cooccurrence_pairs[:20] # Top 20 pairs
|
| 150 |
+
|
| 151 |
+
print(f" Found {len(cooccurrence_pairs)} co-occurrence pairs")
|
| 152 |
+
print(" Top 5 co-occurrence pairs:")
|
| 153 |
+
for i, pair in enumerate(cooccurrence_pairs[:5]):
|
| 154 |
+
print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
|
| 155 |
+
|
| 156 |
+
# Step 7: Path B validation metrics
|
| 157 |
+
print("\n7️⃣ Validating Path B strategy effectiveness...")
|
| 158 |
+
|
| 159 |
+
# Compute keyword density with progress bar
|
| 160 |
+
print(" Computing keyword density...")
|
| 161 |
+
with tqdm(total=2, desc="Density calculation") as pbar:
|
| 162 |
+
# Calculate density per 1000 words for both emergency and treatment keywords
|
| 163 |
+
emergency_density = calculate_density(
|
| 164 |
+
emergency_matrix.sum(axis=1),
|
| 165 |
+
df['text_length']
|
| 166 |
+
)
|
| 167 |
+
pbar.update(1)
|
| 168 |
+
|
| 169 |
+
treatment_density = calculate_density(
|
| 170 |
+
treatment_matrix.sum(axis=1),
|
| 171 |
+
df['text_length']
|
| 172 |
+
)
|
| 173 |
+
pbar.update(1)
|
| 174 |
+
|
| 175 |
+
# Store density in dataframe for visualization
|
| 176 |
+
df['emergency_keyword_density'] = emergency_density
|
| 177 |
+
df['treatment_keyword_density'] = treatment_density
|
| 178 |
+
|
| 179 |
+
# Calculate statistics with the new density metrics
|
| 180 |
+
stats['path_b_validation'] = {
|
| 181 |
+
'avg_emergency_density': float(np.mean(emergency_density)),
|
| 182 |
+
'avg_treatment_density': float(np.mean(treatment_density)),
|
| 183 |
+
'high_density_records': int(sum(
|
| 184 |
+
(emergency_density >= np.percentile(emergency_density, 75)) &
|
| 185 |
+
(treatment_density >= np.percentile(treatment_density, 75))
|
| 186 |
+
)),
|
| 187 |
+
'precision_estimate': float(sum(
|
| 188 |
+
(emergency_density > 0) & (treatment_density > 0)
|
| 189 |
+
) / len(df))
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
# Print detailed results
|
| 193 |
+
print("\n Results:")
|
| 194 |
+
print(f" - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
|
| 195 |
+
print(f" - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
|
| 196 |
+
print(f" - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
|
| 197 |
+
print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
|
| 198 |
+
|
| 199 |
+
# Sample distribution analysis
|
| 200 |
+
print("\n Density Distribution:")
|
| 201 |
+
density_counts = pd.DataFrame({
|
| 202 |
+
'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
|
| 203 |
+
'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
|
| 204 |
+
}).value_counts().head()
|
| 205 |
+
print(" Top 5 density combinations (emergency, treatment):")
|
| 206 |
+
for (em, tr), count in density_counts.items():
|
| 207 |
+
print(f" - {count} documents have {em} emergency and {tr} treatment density")
|
| 208 |
+
|
| 209 |
+
# Visualization
|
| 210 |
+
print("\n8️⃣ Generating visualizations...")
|
| 211 |
+
output_plots = output_dir / "plots"
|
| 212 |
+
output_plots.mkdir(parents=True, exist_ok=True)
|
| 213 |
+
|
| 214 |
+
# 1. Keyword density scatter plot with improved visualization
|
| 215 |
+
plt.figure(figsize=(12, 8))
|
| 216 |
+
plt.scatter(
|
| 217 |
+
emergency_density,
|
| 218 |
+
treatment_density,
|
| 219 |
+
alpha=0.6,
|
| 220 |
+
c=np.log1p(df['text_length']), # Color by log text length
|
| 221 |
+
cmap='viridis'
|
| 222 |
+
)
|
| 223 |
+
plt.colorbar(label='Log Text Length')
|
| 224 |
+
plt.xlabel('Emergency Keyword Density (per 1000 words)')
|
| 225 |
+
plt.ylabel('Treatment Keyword Density (per 1000 words)')
|
| 226 |
+
plt.title('Emergency vs Treatment Keyword Density')
|
| 227 |
+
plt.grid(True, alpha=0.3)
|
| 228 |
+
|
| 229 |
+
# Add mean lines
|
| 230 |
+
plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
|
| 231 |
+
plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
|
| 232 |
+
plt.legend()
|
| 233 |
+
|
| 234 |
+
plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight', dpi=300)
|
| 235 |
+
plt.close()
|
| 236 |
+
|
| 237 |
+
# Save comprehensive statistics
|
| 238 |
+
print("\n9️⃣ Saving analysis results...")
|
| 239 |
+
stats_dir = output_dir / "stats"
|
| 240 |
+
stats_dir.mkdir(parents=True, exist_ok=True)
|
| 241 |
+
|
| 242 |
+
with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
|
| 243 |
+
json.dump(stats, f, indent=2, ensure_ascii=False)
|
| 244 |
+
|
| 245 |
+
print(f"✅ Treatment subset analysis complete!")
|
| 246 |
+
print(f" Results saved to: {output_dir}")
|
| 247 |
+
print(f" Plots: {output_plots}")
|
| 248 |
+
print(f" Statistics: {stats_dir}")
|
| 249 |
+
|
| 250 |
+
return stats
|
| 251 |
+
|
| 252 |
+
if __name__ == "__main__":
|
| 253 |
+
# Configuration
|
| 254 |
+
treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
|
| 255 |
+
emergency_keywords = "../keywords/emergency_keywords.txt"
|
| 256 |
+
treatment_keywords = "../keywords/treatment_keywords.txt"
|
| 257 |
+
output_directory = "../analysis_treatment"
|
| 258 |
+
|
| 259 |
+
# Run analysis
|
| 260 |
+
results = analyze_treatment_subset(
|
| 261 |
+
treatment_file,
|
| 262 |
+
emergency_keywords,
|
| 263 |
+
treatment_keywords,
|
| 264 |
+
output_directory
|
| 265 |
+
)
|
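Worth a note for reviewers: the `path_b_validation` block above boils down to two boolean filters over the density vectors. A minimal sketch of the same arithmetic on toy data (all values below are invented for illustration):

```python
import numpy as np

# Toy density vectors for 8 documents (values invented for illustration)
emergency_density = np.array([0.0, 1.2, 3.5, 0.8, 4.1, 0.0, 2.2, 5.0])
treatment_density = np.array([0.5, 0.0, 2.8, 1.1, 3.9, 0.0, 0.4, 4.2])

# "High density" = at or above the 75th percentile in BOTH vectors
em_hi = emergency_density >= np.percentile(emergency_density, 75)
tr_hi = treatment_density >= np.percentile(treatment_density, 75)
high_density_records = int(np.sum(em_hi & tr_hi))

# Precision estimate = share of documents with at least one match of each type
precision = float(np.sum((emergency_density > 0) & (treatment_density > 0))
                  / len(emergency_density))

print(high_density_records, round(precision, 2))  # -> 2 0.62
```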
dataset/scripts/data_explorer_treatment_opt.py
ADDED
@@ -0,0 +1,262 @@
# /scripts/data_explorer_treatment_opt.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json
from tqdm import tqdm
import re

def calculate_density(matches, text_length):
    """
    Calculate keyword density per 1000 words

    Args:
        matches: Number of keyword matches
        text_length: Total text length

    Returns:
        float: Density per 1000 words
    """
    # NOTE: callers pass text_length = clean_text.str.len(), i.e. a character
    # count, so in practice this is density per 1,000 characters, not words.
    return (matches / text_length) * 1000

def analyze_treatment_subset(
    treatment_file_path,
    emergency_keywords_path,
    treatment_keywords_path,
    output_dir="analysis_treatment_opt"  # Updated default output directory
):
    """
    Specialized analysis for optimized treatment subset focusing on:
    1. Dual keyword analysis (emergency + treatment)
    2. Path B effectiveness validation
    3. Condition mapping data preparation
    4. RAG readiness assessment
    """
    print(f"\n{'='*60}")
    print(f"Treatment Subset Analysis (Optimized Version)")
    print(f"Treatment file: {treatment_file_path}")
    print(f"Emergency keywords: {emergency_keywords_path}")
    print(f"Treatment keywords: {treatment_keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*60}\n")

    # Load data
    print("1️⃣ Loading optimized treatment subset data...")
    df = pd.read_csv(treatment_file_path)
    output_dir = Path(output_dir)

    # Load keyword lists
    print("2️⃣ Loading keyword lists...")
    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
        emergency_keywords = [line.strip() for line in f if line.strip()]

    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
        treatment_keywords = [line.strip() for line in f if line.strip()]

    print(f"   Emergency keywords: {len(emergency_keywords)}")
    print(f"   Treatment keywords: {len(treatment_keywords)}")

    # Basic statistics
    print("\n3️⃣ Computing basic statistics...")
    total_records = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_length = df['text_length'].mean()

    print(f"   Total treatment records: {total_records}")
    print(f"   Average text length: {avg_length:.2f} characters")

    # Initialize comprehensive statistics
    stats = {
        'basic_statistics': {
            'total_records': int(total_records),
            'avg_text_length': float(avg_length),
            'emergency_keywords_count': len(emergency_keywords),
            'treatment_keywords_count': len(treatment_keywords)
        },
        'emergency_keyword_stats': {},
        'treatment_keyword_stats': {},
        'cooccurrence_analysis': {},
        'path_b_validation': {},
        'condition_mapping_candidates': {}
    }

    # Emergency keyword analysis in treatment subset
    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
    for keyword in emergency_keywords:
        # NOTE: plain str.contains treats the keyword as a regex substring;
        # the matrices in step 6 use escaped word-boundary patterns instead.
        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
        stats['emergency_keyword_stats'][keyword] = int(count)
        print(f"   Emergency: {keyword} -> {count} records")

    # Treatment keyword analysis
    print("\n5️⃣ Analyzing treatment keywords...")
    for keyword in treatment_keywords:
        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
        stats['treatment_keyword_stats'][keyword] = int(count)
        print(f"   Treatment: {keyword} -> {count} records")

    # Step 6: Co-occurrence analysis
    print("\n6️⃣ Computing keyword co-occurrence patterns...")

    # Initialize matrices for full dataset
    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)

    # Pre-process text
    print("   Pre-processing text...")
    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

    # Process all emergency keywords
    print("\n   Processing all emergency keywords...")
    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
        matches = emergency_matrix[:, i].sum()
        print(f"   - {keyword}: {matches} matches")

    # Process all treatment keywords
    print("\n   Processing all treatment keywords...")
    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
        matches = treatment_matrix[:, i].sum()
        print(f"   - {keyword}: {matches} matches")

    # Compute co-occurrence matrix
    print("\n   Computing co-occurrence matrix...")
    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
    print("   Computation completed successfully")

    # Extract results
    print("   Extracting co-occurrence pairs...")
    cooccurrence_pairs = []
    for i, em_kw in enumerate(emergency_keywords):
        for j, tr_kw in enumerate(treatment_keywords):
            count = int(cooc_matrix[i, j])
            if count > 0:
                cooccurrence_pairs.append({
                    'emergency_keyword': em_kw,
                    'treatment_keyword': tr_kw,
                    'cooccurrence_count': count,
                    'percentage': float(count / len(df) * 100)
                })

    # Sort and store results
    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # Top 20 pairs

    print(f"   Found {len(cooccurrence_pairs)} co-occurrence pairs")
    print("   Top 5 co-occurrence pairs:")
    for i, pair in enumerate(cooccurrence_pairs[:5]):
        print(f"   {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")

    # Step 7: Path B validation metrics
    print("\n7️⃣ Validating Path B strategy effectiveness...")

    # Compute keyword density with progress bar
    print("   Computing keyword density...")
    with tqdm(total=2, desc="Density calculation") as pbar:
        emergency_density = calculate_density(
            emergency_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)

        treatment_density = calculate_density(
            treatment_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)

    # Store density in dataframe for visualization
    df['emergency_keyword_density'] = emergency_density
    df['treatment_keyword_density'] = treatment_density

    # Calculate statistics with the new density metrics
    stats['path_b_validation'] = {
        'avg_emergency_density': float(np.mean(emergency_density)),
        'avg_treatment_density': float(np.mean(treatment_density)),
        'high_density_records': int(sum(
            (emergency_density >= np.percentile(emergency_density, 75)) &
            (treatment_density >= np.percentile(treatment_density, 75))
        )),
        'precision_estimate': float(sum(
            (emergency_density > 0) & (treatment_density > 0)
        ) / len(df))
    }

    # Print detailed results
    print("\n   Results:")
    print(f"   - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
    print(f"   - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
    print(f"   - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
    print(f"   - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")

    # Sample distribution analysis
    print("\n   Density Distribution:")
    density_counts = pd.DataFrame({
        'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
        'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
    }).value_counts().head()
    print("   Top 5 density combinations (emergency, treatment):")
    for (em, tr), count in density_counts.items():
        print(f"   - {count} documents have {em} emergency and {tr} treatment density")

    # Visualization
    print("\n8️⃣ Generating visualizations...")
    output_plots = output_dir / "plots"
    output_plots.mkdir(parents=True, exist_ok=True)

    # 1. Keyword density scatter plot with improved visualization
    plt.figure(figsize=(12, 8))
    plt.scatter(
        emergency_density,
        treatment_density,
        alpha=0.6,
        c=np.log1p(df['text_length']),
        cmap='viridis'
    )
    plt.colorbar(label='Log Text Length')
    plt.xlabel('Emergency Keyword Density (per 1000 words)')
    plt.ylabel('Treatment Keyword Density (per 1000 words)')
    plt.title('Emergency vs Treatment Keyword Density (Optimized)')
    plt.grid(True, alpha=0.3)

    # Add mean lines
    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
    plt.legend()

    plt.savefig(output_plots / "keyword_density_scatter_opt.png", bbox_inches='tight', dpi=300)
    plt.close()

    # Save comprehensive statistics
    print("\n9️⃣ Saving analysis results...")
    stats_dir = output_dir / "stats"
    stats_dir.mkdir(parents=True, exist_ok=True)

    with open(stats_dir / "treatment_analysis_comprehensive_opt.json", 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print(f"✅ Treatment subset analysis complete! (Optimized Version)")
    print(f"   Results saved to: {output_dir}")
    print(f"   Plots: {output_plots}")
    print(f"   Statistics: {stats_dir}")

    return stats

if __name__ == "__main__":
    # Configuration for optimized version
    treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    treatment_keywords = "../keywords/treatment_keywords.txt"
    output_directory = "../analysis_treatment_opt"

    # Run analysis
    results = analyze_treatment_subset(
        treatment_file,
        emergency_keywords,
        treatment_keywords,
        output_directory
    )
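The boolean matrix product in step 6 is the core trick in both explorer scripts: entry (i, j) of `cooc_matrix` counts documents that match emergency keyword i and treatment keyword j. A minimal sketch with made-up membership matrices:

```python
import numpy as np

# Toy membership matrices: 3 documents x 2 emergency keywords,
# and the same 3 documents x 2 treatment keywords (values invented).
emergency_matrix = np.array([[1, 0],
                             [1, 1],
                             [0, 1]], dtype=bool)
treatment_matrix = np.array([[1, 1],
                             [0, 1],
                             [1, 0]], dtype=bool)

# (keywords x docs) @ (docs x keywords) -> pairwise co-occurrence counts
cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
print(cooc_matrix)
# [[1 2]
#  [1 1]]
```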
dataset/scripts/keyword_Match_Clean_for_subset_filter.txt
ADDED
@@ -0,0 +1,85 @@
# Keyword Matching and Text Cleaning Logic for Subset Filtering

## 1. Keyword Preprocessing
```python
def preprocess_keywords(keywords_file):
    # Handle special medical term variants
    special_terms = {
        'x-ray': ['x-ray', 'x ray', 'xray'],
        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
        'point-of-care': ['point-of-care', 'point of care']
    }

    # Read and preprocess keywords
    with open(keywords_file, "r", encoding="utf-8") as f:
        keywords = [
            line.strip()  # Remove whitespace
            .lower()      # Convert to lowercase
            for line in f
            if line.strip()
        ]

    # Process special term variants
    processed_keywords = []
    for kw in keywords:
        if kw in special_terms:
            processed_keywords.extend(special_terms[kw])
        else:
            processed_keywords.append(kw)

    return processed_keywords
```
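For instance, under the hypothetical assumption that `keywords_demo.txt` contains the two lines `x-ray` and `defibrillation`, the expansion behaves like this:

```python
print(preprocess_keywords("keywords_demo.txt"))
# -> ['x-ray', 'x ray', 'xray', 'defibrillation']
```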

## 2. Regex Pattern Processing
```python
import re

def create_regex_pattern(keywords):
    # Simple word boundary matching
    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
    return re.compile(pattern, re.IGNORECASE)
```

### Regex Pattern Explanation:
- `\b`: Word boundary matching
- `(?:...)`: Non-capturing group
- `re.escape()`: Escape special characters
- `re.IGNORECASE`: Case-insensitive matching

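A quick sanity check of the compiled pattern (toy keyword list; note that `\b` fires at any word/non-word transition, so hyphenated contexts such as "non-emergency" still match, while fused words such as "subacute" do not):

```python
pattern = create_regex_pattern(['emergency', 'acute', 'x-ray'])

print(pattern.findall("Emergency X-RAY ordered"))  # ['Emergency', 'X-RAY']
print(pattern.findall("non-emergency visit"))      # ['emergency'] (hyphen is a boundary)
print(pattern.findall("subacute condition"))       # [] (no boundary inside 'subacute')
```
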
## 3. Text Preprocessing and Matching
```python
# Create lowercase version of text
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

# Match keywords
df["treatment_matched"] = (
    df["clean_text_lower"]
    .apply(lambda text: "|".join(pattern.findall(text)) or "")
)
```

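Putting the pieces together on toy data (illustrative keywords, `create_regex_pattern` from section 2 in scope):

```python
import pandas as pd

pattern = create_regex_pattern(['chest pain', 'sepsis'])
df = pd.DataFrame({'clean_text': ["Chest pain with sepsis", "Routine visit", None]})
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
df['treatment_matched'] = df['clean_text_lower'].apply(
    lambda text: "|".join(pattern.findall(text)) or ""
)
print(df['treatment_matched'].tolist())  # ['chest pain|sepsis', '', '']
```
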
## 4. Processing Logic Details

### 4.1 Special Term Handling Rationale
- Common variants in medical literature
- Maintain semantic consistency
- Improve matching accuracy

### 4.2 Regex Matching Strategy
- Word boundary matching for complete terms
- Precompiled patterns for performance
- Case-insensitive matching for flexibility

### 4.3 Text Preprocessing Steps
1. Fill null values (fillna)
2. Convert to lowercase (str.lower)
3. Create a dedicated lowercase column to avoid repeated conversions

## 5. Output Format
- matched column: Pipe-separated matched keywords
- type column: Document type identifier ("emergency" or "treatment")
- condition column: Reserved for future condition mapping

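For example (illustrative values only), a record that hits both keyword lists might carry `matched = "sepsis|shock"`, `treatment_matched = "antibiotic|fluid resuscitation"`, and `type = "treatment"`, with `condition` left empty until the mapping is built.
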
## 6. Important Considerations
1. Regular maintenance is required for the special term variants
2. Precompile regex patterns for performance
3. Store the preprocessed lowercase text once to avoid redundant computation
4. Keep the column structure consistent between the emergency and treatment subsets
dataset/scripts/test_keyword_matching.py
ADDED
@@ -0,0 +1,175 @@
import pandas as pd
import re
from pathlib import Path
import json

def test_special_terms_matching():
    """
    Test special medical term matching logic
    """
    # Test cases for different scenarios
    test_cases = {
        "x-ray variants": [
            "Patient needs an x-ray of the chest",
            "Ordered chest xray",
            "X ray shows pneumonia",
            "XRAY negative"
        ],
        "ct-scan variants": [
            "CT scan reveals nodule",
            "CT-scan indicates mass",
            "Requires ctscan urgently",
            "CTSCAN of abdomen"
        ],
        "point-of-care variants": [
            "Point-of-care testing needed",
            "Point of care ultrasound",
            "POC testing results"
        ],
        "mixed cases": [
            "Ordered both x-ray and CT scan",
            "XRAY and CTSCAN negative",
            "Multiple point-of-care tests with x-ray"
        ],
        "negative cases": [
            "No imaging mentioned",
            "Regular examination only",
            "Laboratory tests pending"
        ]
    }

    # Special terms dictionary (from keyword_Match_Clean_for_subset_filter.txt)
    special_terms = {
        'x-ray': ['x-ray', 'x ray', 'xray'],
        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
        'point-of-care': ['point-of-care', 'point of care']
    }

    # Create test DataFrame
    test_df = pd.DataFrame({
        'clean_text': [text for cases in test_cases.values() for text in cases],
        'category': [cat for cat, texts in test_cases.items() for _ in texts]
    })

    # Process keywords
    processed_keywords = []
    for term, variants in special_terms.items():
        processed_keywords.extend(variants)

    # Create regex pattern
    pattern = r"\b(?:" + "|".join(map(re.escape, processed_keywords)) + r")\b"

    # Apply matching logic
    test_df['matched'] = (
        test_df['clean_text']
        .fillna("")
        .str.findall(pattern, flags=re.IGNORECASE)
        .apply(lambda lst: "|".join(lst) if lst else "")
    )

    return test_df

def test_basic_matching():
    """
    Test basic keyword matching functionality
    """
    # Basic test cases
    test_cases = {
        "simple matches": [
            "Emergency treatment required",
            "Acute condition observed",
            "Urgent care needed"
        ],
        "case variations": [
            "EMERGENCY situation",
            "Acute RESPIRATORY failure",
            "URgent surgical intervention"
        ],
        "multiple matches": [
            "Emergency treatment for acute condition",
            "Urgent care in emergency department",
            "Acute respiratory emergency"
        ],
        "partial words": [
            "Non-emergency situation",
            "Subacute condition",
            "Emergency-related"
        ]
    }

    # Create test DataFrame
    test_df = pd.DataFrame({
        'clean_text': [text for cases in test_cases.values() for text in cases],
        'category': [cat for cat, texts in test_cases.items() for _ in texts]
    })

    # Test keywords
    test_keywords = ['emergency', 'acute', 'urgent']
    pattern = r"\b(?:" + "|".join(map(re.escape, test_keywords)) + r")\b"

    # Apply matching logic
    test_df['matched'] = (
        test_df['clean_text']
        .fillna("")
        .str.findall(pattern, flags=re.IGNORECASE)
        .apply(lambda lst: "|".join(lst) if lst else "")
    )

    return test_df

def save_test_results(results_dict):
    """
    Save test results to JSON file
    """
    output_dir = Path("../analysis")
    output_dir.mkdir(exist_ok=True)

    output_file = output_dir / "keyword_matching_test_results.json"

    # Convert DataFrame results to dictionary
    for key, df in results_dict.items():
        results_dict[key] = df.to_dict(orient='records')

    with open(output_file, 'w') as f:
        json.dump(results_dict, f, indent=2)

    print(f"Results saved to: {output_file}")

def run_tests():
    """
    Run all tests and output results
    """
    print("🧪 Running keyword matching tests...")

    # Run tests
    special_terms_results = test_special_terms_matching()
    basic_matching_results = test_basic_matching()

    # Print results
    print("\n📊 Special Terms Matching Results:")
    for category in special_terms_results['category'].unique():
        print(f"\n{category}:")
        subset = special_terms_results[special_terms_results['category'] == category]
        for _, row in subset.iterrows():
            print(f"Text: {row['clean_text']}")
            print(f"Matched: {row['matched'] or 'No matches'}")
            print("-" * 50)

    print("\n📊 Basic Matching Results:")
    for category in basic_matching_results['category'].unique():
        print(f"\n{category}:")
        subset = basic_matching_results[basic_matching_results['category'] == category]
        for _, row in subset.iterrows():
            print(f"Text: {row['clean_text']}")
            print(f"Matched: {row['matched'] or 'No matches'}")
            print("-" * 50)

    # Save results
    results_dict = {
        'special_terms_matching': special_terms_results,
        'basic_matching': basic_matching_results
    }
    save_test_results(results_dict)

if __name__ == "__main__":
    run_tests()
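One behavior worth flagging for reviewers of the "partial words" cases above: `\b` treats a hyphen as a word boundary, so two of the three cases still match. A stand-alone check with the same pattern construction as the test:

```python
import re

keywords = ['emergency', 'acute', 'urgent']
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
for text in ["Non-emergency situation", "Subacute condition", "Emergency-related"]:
    print(text, "->", re.findall(pattern, text, flags=re.IGNORECASE))
# Non-emergency situation -> ['emergency']
# Subacute condition -> []
# Emergency-related -> ['Emergency']
```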
requirements.txt
CHANGED
@@ -10,12 +10,15 @@ Brotli==1.1.0
 certifi==2025.7.14
 charset-normalizer==3.4.2
 click==8.2.1
+contourpy==1.3.2
+cycler==0.12.1
 datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
 fastapi==0.116.1
 ffmpy==0.6.0
 filelock==3.18.0
+fonttools==4.59.0
 frozenlist==1.7.0
 fsspec==2025.3.0
 gradio==5.38.0
@@ -29,8 +32,10 @@ huggingface-hub==0.33.4
 idna==3.10
 Jinja2==3.1.6
 jiter==0.10.0
+kiwisolver==1.4.8
 markdown-it-py==3.0.0
 MarkupSafe==3.0.2
+matplotlib==3.10.3
 mdurl==0.1.2
 multidict==6.6.3
 multiprocess==0.70.16
@@ -46,6 +51,7 @@ pydantic==2.11.7
 pydantic_core==2.33.2
 pydub==0.25.1
 Pygments==2.19.2
+pyparsing==3.2.3
 python-dateutil==2.9.0.post0
 python-multipart==0.0.20
 pytz==2025.2
@@ -56,6 +62,7 @@ rich==14.0.0
 ruff==0.12.4
 safehttpx==0.1.6
 safetensors==0.5.3
+seaborn==0.13.2
 semantic-version==2.10.0
 shellingham==1.5.4
 six==1.17.0