refactor(data_processing): enhance chunking and embedding generation
BREAKING CHANGE: Switched to token-based chunking with metadata enrichment
Key Changes:
- Implemented token-based chunking strategy (256 tokens) with 64-token overlap
- Added embedding caching mechanism using MD5 hash for performance
- Enhanced metadata for treatment chunks (match_type, keyword presence)
- Simplified progress bars with consistent formatting
- Improved error handling and logging
Technical Details (illustrative sketches follow this list):
1. Chunking Improvements:
- Switched from character-based to token-based chunking
- Added dynamic token-to-char ratio calculation
- Implemented keyword-centered chunk generation
- Added chunk overlap for better context preservation
2. Embedding Optimization:
- Added MD5-based chunk caching
- Implemented batch processing (32 chunks per batch)
- Added progress tracking for embedding generation
- Optimized memory usage for large datasets
3. Metadata Enhancements:
- Added source_type tracking (emergency/treatment)
- Enhanced treatment chunks with keyword presence info
- Added match_type classification (both/emergency_only/treatment_only)
- Preserved original keyword metadata
4. Quality of Life:
- Improved progress bars with consistent formatting
- Enhanced logging with clear phase indicators
- Added comprehensive data validation
- Improved error messages and debugging info
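
A minimal sketch of the token-based, keyword-centered chunking described in item 1 (256-token target, 64-token overlap). This is illustrative only and not the repository's implementation: the helper name, the tokenizer argument, and the overlap handling are assumptions; the actual code (see the diff below) converts a token budget into a character window centered on each matched keyword.

from typing import Dict, List

def keyword_centered_chunks(text: str,
                            keywords: List[str],
                            tokenizer,
                            chunk_size: int = 256,
                            overlap: int = 64) -> List[Dict[str, object]]:
    """Illustrative token-based chunking: one chunk per keyword hit, sized by a
    token budget (plus overlap) converted into a character window."""
    tokens = tokenizer(text)
    # Dynamic token-to-char ratio, as described in the commit notes
    chars_per_token = max(len(text) / max(len(tokens), 1), 1.0)
    chunks = []
    for kw in keywords:
        pos = text.lower().find(kw.lower())
        if pos == -1:
            continue
        half_window = int((chunk_size + overlap) * chars_per_token / 2)
        start = max(0, pos - half_window)
        end = min(len(text), pos + len(kw) + half_window)
        chunks.append({"text": text[start:end], "primary_keyword": kw})
    return chunks

# Usage with a whitespace "tokenizer" stand-in; a real pipeline would pass the
# embedding model's tokenizer (e.g. its tokenize method).
demo = keyword_centered_chunks("Severe chest pain treated with aspirin.",
                               ["chest pain"], str.split)
print(demo[0]["primary_keyword"])  # -> chest pain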
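
Similarly, the MD5-keyed embedding cache and 32-chunk batching from item 2 can be sketched as below. This is a sketch under assumptions: the cache structure and the encode callable are placeholders, not the pipeline's actual API.

import hashlib
from typing import Callable, Dict, List

def embed_with_cache(texts: List[str],
                     encode_fn: Callable[[List[str]], List[List[float]]],
                     cache: Dict[str, List[float]],
                     batch_size: int = 32) -> List[List[float]]:
    """Illustrative MD5-keyed cache: only texts whose hash is missing are
    encoded, in batches of `batch_size`; cached vectors are reused."""
    keys = [hashlib.md5(t.encode("utf-8")).hexdigest() for t in texts]
    missing = [(k, t) for k, t in zip(keys, texts) if k not in cache]

    for i in range(0, len(missing), batch_size):
        batch = missing[i:i + batch_size]
        vectors = encode_fn([t for _, t in batch])  # e.g. a SentenceTransformer-style encode
        for (k, _), vec in zip(batch, vectors):
            cache[k] = vec

    return [cache[k] for k in keys]

# Usage with a dummy encoder standing in for the real 768-dim model:
cache: Dict[str, List[float]] = {}
fake_encode = lambda batch: [[float(len(t))] for t in batch]
embed_with_cache(["chest pain", "IV fluids", "chest pain"], fake_encode, cache)
print(len(cache))  # -> 2 (the duplicate text hits the cache)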
Testing:
- Verified chunk generation with token limits
- Validated embedding dimensions (768)
- Confirmed metadata consistency
- Tested cache mechanism functionality
Migration Note:
Previous character-based chunks will need to be regenerated using the new
token-based approach. Run the full pipeline to update all embeddings.
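
A hedged sketch of that regeneration step, using only names that appear in this diff plus clearly marked placeholders; the actual pipeline entry point may differ.

from data_processing import DataProcessor

processor = DataProcessor()
processor.load_embedding_model()  # appears in the tests below; loads model and tokenizer

# process_emergency_chunks() appears in this diff; the treatment-side call and the
# re-embedding step below are hypothetical placeholders for whatever the pipeline exposes.
emergency_chunks = processor.process_emergency_chunks()
# treatment_chunks = processor.process_treatment_chunks()   # hypothetical name
# processor.generate_embeddings(emergency_chunks + treatment_chunks)  # hypothetical name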
@@ -106,7 +106,7 @@ class DataProcessor:
             raise FileNotFoundError(f"Treatment data not found: {treatment_path}")

         # Load data
-        self.emergency_data = pd.read_json(str(emergency_path), lines=True) #
+        self.emergency_data = pd.read_json(str(emergency_path), lines=True) # use str() to ensure path is correct
         self.treatment_data = pd.read_json(str(treatment_path), lines=True)

         logger.info(f"Loaded {len(self.emergency_data)} emergency records")
@@ -167,11 +167,8 @@ class DataProcessor:
         # Get the keyword text (already lowercase)
         actual_keyword = text[keyword_pos:keyword_pos + len(keyword)]

-        # Calculate rough window size using
-
-        # Use 512 tokens as target (model's max limit)
-        ROUGH_CHUNK_TARGET_TOKENS = 512
-        char_window = int(ROUGH_CHUNK_TARGET_TOKENS * chars_per_token / 2)
+        # Calculate rough window size using simple ratio
+        char_window = int(chunk_size * chars_per_token / 2)

         # Get rough chunk boundaries in characters
         rough_start = max(0, keyword_pos - char_window)
@@ -235,7 +232,7 @@ class DataProcessor:
                                    doc_id: str = None) -> List[Dict[str, Any]]:
         """
         Create chunks for treatment data with both emergency and treatment keywords
-
+        using token-based separate chunking strategy with enhanced metadata for treatment chunks

         Args:
             text: Input text
@@ -247,47 +244,79 @@ class DataProcessor:
         Returns:
             List of chunk dictionaries with enhanced metadata for treatment chunks
         """
-        if not treatment_keywords or pd.isna(treatment_keywords):
-            return []
-
         chunks = []
         chunk_size = chunk_size or self.chunk_size

-        #
-
-
+        # Case 1: No keywords present
+        if not emergency_keywords and not treatment_keywords:
+            return []
+
+        # Case 2: Only emergency keywords (early return)
+        if emergency_keywords and not treatment_keywords:
+            em_chunks = self.create_keyword_centered_chunks(
+                text=text,
+                matched_keywords=emergency_keywords,
+                chunk_size=chunk_size,
+                doc_id=doc_id
+            )
+            for chunk in em_chunks:
+                chunk['source_type'] = 'emergency'
+            return em_chunks

-        #
+        # Case 3: Only treatment keywords (early return)
+        if treatment_keywords and not emergency_keywords:
+            tr_chunks = self.create_keyword_centered_chunks(
+                text=text,
+                matched_keywords=treatment_keywords,
+                chunk_size=chunk_size,
+                doc_id=doc_id
+            )
+            for chunk in tr_chunks:
+                chunk['source_type'] = 'treatment'
+                chunk['contains_treatment_kws'] = treatment_keywords.split('|')
+                chunk['contains_emergency_kws'] = []
+                chunk['match_type'] = 'treatment_only'
+            return tr_chunks
+
+        # Case 4: Both keywords present - separate processing
+        # Process emergency keywords
         if emergency_keywords:
             em_chunks = self.create_keyword_centered_chunks(
-                text,
+                text=text,
+                matched_keywords=emergency_keywords,
+                chunk_size=chunk_size,
+                doc_id=doc_id
             )
-            # Mark these as emergency chunks and keep the original metadata format
             for chunk in em_chunks:
                 chunk['source_type'] = 'emergency'
             chunks.extend(em_chunks)

-        #
+        # Process treatment keywords
         if treatment_keywords:
             tr_chunks = self.create_keyword_centered_chunks(
-                text,
+                text=text,
+                matched_keywords=treatment_keywords,
+                chunk_size=chunk_size,
+                doc_id=doc_id
             )

-            #
+            # Parse keywords for metadata
+            em_kws = emergency_keywords.split('|') if emergency_keywords else []
+            tr_kws = treatment_keywords.split('|') if treatment_keywords else []
+
+            # Add metadata for each treatment chunk
             for i, chunk in enumerate(tr_chunks):
                 chunk_text = chunk['text'].lower()

-                #
+                # Check for keyword presence in chunk text
                 contains_emergency_kws = [
                     kw for kw in em_kws if kw.lower() in chunk_text
                 ]
-
-                # Check which treatment keywords the text contains
                 contains_treatment_kws = [
                     kw for kw in tr_kws if kw.lower() in chunk_text
                 ]

-                #
+                # Determine match type based on keyword presence
                 has_emergency = len(contains_emergency_kws) > 0
                 has_treatment = len(contains_treatment_kws) > 0

@@ -300,20 +329,19 @@ class DataProcessor:
                 else:
                     match_type = "none"

-                #
+                # Update chunk metadata
                 chunk.update({
                     'source_type': 'treatment',
                     'contains_emergency_kws': contains_emergency_kws,
                     'contains_treatment_kws': contains_treatment_kws,
                     'match_type': match_type,
-                    'emergency_keywords': emergency_keywords, #
+                    'emergency_keywords': emergency_keywords, # Store original metadata
                     'treatment_keywords': treatment_keywords,
                     'chunk_id': f"{doc_id}_treatment_chunk_{i}" if doc_id else f"treatment_chunk_{i}"
                 })

             chunks.extend(tr_chunks)

-        logger.debug(f"Created {len(chunks)} dual-keyword chunks for document {doc_id or 'unknown'}")
         return chunks

     def process_emergency_chunks(self) -> List[Dict[str, Any]]:
@@ -323,12 +351,14 @@ class DataProcessor:

         all_chunks = []

-        # Add progress bar
+        # Add simplified progress bar
         for idx, row in tqdm(self.emergency_data.iterrows(),
                              total=len(self.emergency_data),
-                             desc="Processing
-                             unit="
-                             leave=
+                             desc="Emergency Processing",
+                             unit="docs",
+                             leave=True,
+                             ncols=80,
+                             mininterval=1.0):
             if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
                 chunks = self.create_keyword_centered_chunks(
                     text=row['clean_text'],
@@ -360,12 +390,14 @@ class DataProcessor:

         all_chunks = []

-        # Add progress bar
+        # Add simplified progress bar
         for idx, row in tqdm(self.treatment_data.iterrows(),
                              total=len(self.treatment_data),
-                             desc="Processing
-                             unit="
-                             leave=
+                             desc="Treatment Processing",
+                             unit="docs",
+                             leave=True,
+                             ncols=80,
+                             mininterval=1.0):
             if (pd.notna(row.get('clean_text')) and
                 pd.notna(row.get('treatment_matched'))):

@@ -469,10 +501,12 @@ class DataProcessor:
         logger.info(f"Processing {len(texts)} new {chunk_type} texts in {total_batches} batches...")

         for i in tqdm(range(0, len(texts), batch_size),
-                      desc=f"Embedding {chunk_type}
+                      desc=f"Embedding {chunk_type}",
                       total=total_batches,
-                      unit="
-                      leave=
+                      unit="batches",
+                      leave=True,
+                      ncols=80,
+                      mininterval=0.5):
             batch_texts = texts[i:i + batch_size]
             batch_emb = model.encode(
                 batch_texts,
@@ -27,7 +27,7 @@ current_dir = Path(__file__).parent.resolve()
 project_root = current_dir.parent
 sys.path.append(str(project_root / "src"))

-from data_processing import DataProcessor
+from data_processing import DataProcessor #type: ignore

 class TestChunkQualityAnalysis:

@@ -12,7 +12,7 @@ import pandas as pd
 # Add src to path
 sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))

-from data_processing import DataProcessor
+from data_processing import DataProcessor #type: ignore
 import logging

 # Setup logging
@@ -80,7 +80,7 @@ def test_chunking():
             chunks = processor.create_keyword_centered_chunks(
                 text=row['clean_text'],
                 matched_keywords=row['matched'],
-                chunk_size=
+                chunk_size=256, # Updated to use 256 tokens
                 doc_id=str(row.get('id', idx))
             )
             emergency_chunks.extend(chunks)
@@ -97,7 +97,7 @@ def test_chunking():
                 text=row['clean_text'],
                 emergency_keywords=row.get('matched', ''),
                 treatment_keywords=row['treatment_matched'],
-                chunk_size=
+                chunk_size=256, # Updated to use 256 tokens
                 doc_id=str(row.get('id', idx))
             )
             treatment_chunks.extend(chunks)
@@ -116,7 +116,7 @@ def test_chunking():
         sample_chunk = treatment_chunks[0]
         print(f"\nSample treatment chunk:")
         print(f" Primary keyword: {sample_chunk['primary_keyword']}")
-        print(f" Emergency keywords: {sample_chunk
+        print(f" Emergency keywords: {sample_chunk.get('emergency_keywords', '')}")
         print(f" Text length: {len(sample_chunk['text'])}")
         print(f" Text preview: {sample_chunk['text'][:100]}...")

@@ -186,18 +186,109 @@ def test_token_chunking():
         print(f"❌ Token chunking test failed: {e}")
         return False

+def test_dual_keyword_chunks():
+    """Test the enhanced dual keyword chunking functionality with token-based approach"""
+    print("\n" + "="*50)
+    print("TESTING DUAL KEYWORD CHUNKING")
+    print("="*50)
+
+    try:
+        processor = DataProcessor()
+        processor.load_embedding_model()  # Need tokenizer for token count verification
+
+        # Test case 1: Both emergency and treatment keywords
+        print("\nTest Case 1: Both Keywords")
+        text = "Patient with acute MI requires immediate IV treatment. Additional chest pain symptoms require aspirin administration."
+        emergency_kws = "MI|chest pain"
+        treatment_kws = "IV|aspirin"
+
+        chunks = processor.create_dual_keyword_chunks(
+            text=text,
+            emergency_keywords=emergency_kws,
+            treatment_keywords=treatment_kws,
+            chunk_size=256
+        )
+
+        # Verify chunk properties
+        for i, chunk in enumerate(chunks):
+            print(f"\nChunk {i+1}:")
+            # Verify source type
+            source_type = chunk.get('source_type')
+            assert source_type in ['emergency', 'treatment'], f"Invalid source_type: {source_type}"
+            print(f"• Source type: {source_type}")
+
+            # Verify metadata for treatment chunks
+            if source_type == 'treatment':
+                contains_em = chunk.get('contains_emergency_kws', [])
+                contains_tr = chunk.get('contains_treatment_kws', [])
+                match_type = chunk.get('match_type')
+                print(f"• Contains Emergency: {contains_em}")
+                print(f"• Contains Treatment: {contains_tr}")
+                print(f"• Match Type: {match_type}")
+                assert match_type in ['both', 'emergency_only', 'treatment_only', 'none'], \
+                    f"Invalid match_type: {match_type}"
+
+            # Verify token count
+            tokens = processor.tokenizer.tokenize(chunk['text'])
+            token_count = len(tokens)
+            print(f"• Token count: {token_count}")
+            # Allow for overlap
+            assert token_count <= 384, f"Chunk too large: {token_count} tokens"
+
+            # Print text preview
+            print(f"• Text preview: {chunk['text'][:100]}...")
+
+        # Test case 2: Emergency keywords only
+        print("\nTest Case 2: Emergency Only")
+        text = "Patient presents with severe chest pain and dyspnea."
+        emergency_kws = "chest pain"
+        treatment_kws = ""
+
+        chunks = processor.create_dual_keyword_chunks(
+            text=text,
+            emergency_keywords=emergency_kws,
+            treatment_keywords=treatment_kws,
+            chunk_size=256
+        )
+
+        assert len(chunks) > 0, "No chunks generated for emergency-only case"
+        print(f"✓ Generated {len(chunks)} chunks")
+
+        # Test case 3: Treatment keywords only
+        print("\nTest Case 3: Treatment Only")
+        text = "Administer IV fluids and monitor response."
+        emergency_kws = ""
+        treatment_kws = "IV"
+
+        chunks = processor.create_dual_keyword_chunks(
+            text=text,
+            emergency_keywords=emergency_kws,
+            treatment_keywords=treatment_kws,
+            chunk_size=256
+        )
+
+        assert len(chunks) > 0, "No chunks generated for treatment-only case"
+        print(f"✓ Generated {len(chunks)} chunks")
+
+        print("\n✅ All dual keyword chunking tests passed")
+        return True
+
+    except Exception as e:
+        print(f"\n❌ Dual keyword chunking test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
 def main():
     """Run all tests"""
     print("Starting data processing tests...\n")

-    # Import pandas here since it's used in chunking test
-    import pandas as pd
-
     tests = [
         test_data_loading,
         test_chunking,
         test_model_loading,
-        test_token_chunking
+        test_token_chunking,
+        test_dual_keyword_chunks # Added new test
     ]

     results = []
@@ -20,7 +20,7 @@ print(f"• Current directory: {current_dir}")
 print(f"• Project root: {project_root}")
 print(f"• Python path: {sys.path}")

-from data_processing import DataProcessor
+from data_processing import DataProcessor #type: ignore


 class TestEmbeddingAndIndex:
@@ -45,7 +45,7 @@ class TestEmbeddingValidation:
         print(f"• Project root: {self.project_root}")
         print(f"• Models directory: {self.models_dir}")
         print(f"• Embeddings directory: {self.embeddings_dir}")
-
+
         self.logger.info(f"Project root: {self.project_root}")
         self.logger.info(f"Models directory: {self.models_dir}")
         self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
@@ -277,7 +277,7 @@ def main():
     try:
         test.test_embedding_dimensions()
         test.test_multiple_known_item_search()
-        test.test_balanced_cross_dataset_search()
+        test.test_balanced_cross_dataset_search()

         print("\n" + "="*60)
         print("🎉 ALL EMBEDDING VALIDATION TESTS COMPLETED SUCCESSFULLY!")