YanBoChen committed · f29be38
1 Parent(s): 32b1f74
Add end-to-end pipeline test script for OnCall.ai
- Implemented a comprehensive test suite to validate the entire workflow from user input to structured medical advice generation.
- Included realistic medical queries to simulate user interactions and confirm the pipeline's functionality.
- Integrated logging and detailed reporting for test results, including success rates and performance analysis.
- Added functionality to save test results in JSON format for further analysis.
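
To exercise the new suite and inspect the saved JSON results, a minimal sketch (the invocation and loading snippet below are illustrative, not part of this commit; they assume the layout introduced here, with the script under tests/ and model access configured):

# Hypothetical invocation from the repository root:
#   python tests/test_end_to_end_pipeline.py
import json
from pathlib import Path

# The script writes end_to_end_pipeline_results_<timestamp>.json under tests/
latest = sorted(Path("tests").glob("end_to_end_pipeline_results_*.json"))[-1]
results = json.loads(latest.read_text(encoding="utf-8"))

meta = results["test_metadata"]
print(f"{meta['successful_tests']}/{meta['total_tests']} tests passed "
      f"in {meta['total_duration_seconds']:.1f}s")
for r in results["pipeline_results"]:
    status = "PASS" if r["success"] else "FAIL"
    print(f"{r['test_id']}: {status} ({r['total_pipeline_time']:.2f}s)")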
- src/generation.py +519 -0
- tests/result_of_test_end_to_end_pipeline.md +0 -0
- tests/test_end_to_end_pipeline.py +473 -0
src/generation.py
ADDED
@@ -0,0 +1,519 @@
"""
OnCall.ai Medical Advice Generation Module

This module handles:
1. RAG prompt construction from retrieval results
2. Medical advice generation using Med42-70B
3. Response formatting and confidence assessment
4. Integration with multi-dataset architecture

Author: OnCall.ai Team
Date: 2025-07-31
"""

import logging
from typing import Dict, List, Optional, Any, Union
from datetime import datetime
import json

# Import existing LLM client
from llm_clients import llm_Med42_70BClient

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class MedicalAdviceGenerator:
    """
    Core generation module for medical advice using a RAG approach
    """

    def __init__(self, llm_client: Optional[llm_Med42_70BClient] = None):
        """
        Initialize medical advice generator

        Args:
            llm_client: Optional Med42-70B client; creates a new one if None
        """
        self.llm_client = llm_client or llm_Med42_70BClient()

        # Dataset source priorities for different intentions
        self.dataset_priorities = {
            "treatment": {
                "emergency_subset": 2,
                "treatment_subset": 4,
                "symptom_subset": 0,    # Reserved for Dataset B
                "diagnosis_subset": 0   # Reserved for Dataset B
            },
            "diagnosis": {
                "emergency_subset": 4,
                "treatment_subset": 2,
                "symptom_subset": 0,    # Reserved for Dataset B
                "diagnosis_subset": 0   # Reserved for Dataset B
            },
            # "STAT": {
            #     # NOTE: Use when query contains urgent indicators like "NOW", "STAT", "critical"
            #     "emergency_subset": 5,
            #     "treatment_subset": 1,
            #     "symptom_subset": 0,    # Reserved for Dataset B
            #     "diagnosis_subset": 0   # Reserved for Dataset B
            # }
        }

        logger.info("MedicalAdviceGenerator initialized")

    def generate_medical_advice(self, user_query: str, retrieval_results: Dict[str, Any],
                                intention: Optional[str] = None) -> Dict[str, Any]:
        """
        Complete pipeline: construct prompt → generate advice → format response

        Args:
            user_query: Original user medical query
            retrieval_results: Results from BasicRetrievalSystem.search()
            intention: Optional query intention ('treatment', 'diagnosis', 'STAT' (tentative))

        Returns:
            Dict containing formatted medical advice and metadata
        """
        try:
            logger.info(f"Generating medical advice for query: '{user_query[:50]}...'")
            start_time = datetime.now()

            # Step 1: Extract and classify chunks from retrieval results
            classified_chunks = self._classify_retrieval_chunks(retrieval_results)

            # Step 2: Build RAG prompt based on intention and chunk classification
            rag_prompt = self.generate_prompt(user_query, classified_chunks, intention)

            # Step 3: Generate medical advice using Med42-70B
            generation_result = self._generate_with_med42(rag_prompt)

            # Step 4: Format structured response
            formatted_response = self._format_medical_response(
                user_query=user_query,
                generated_advice=generation_result,
                chunks_used=classified_chunks,
                intention=intention,
                processing_time=(datetime.now() - start_time).total_seconds()
            )

            processing_duration = formatted_response.get('query_metadata', {}).get('processing_time_seconds', 0)
            logger.info(f"Medical advice generated successfully in {processing_duration:.3f}s")
            return formatted_response

        except Exception as e:
            logger.error(f"Medical advice generation failed: {e}")
            return self._generate_error_response(user_query, str(e))

    def generate_prompt(self, user_query: str, classified_chunks: Dict[str, List],
                        intention: Optional[str] = None) -> str:
        """
        Enhanced prompt generator with flexible dataset integration

        Args:
            user_query: User's medical query
            classified_chunks: Chunks classified by dataset source
            intention: Query intention if detected

        Returns:
            Structured RAG prompt for Med42-70B
        """
        logger.info(f"Generating prompt with intention: {intention}")

        # Extract chunks by dataset source
        emergency_chunks = classified_chunks.get("emergency_subset", [])
        treatment_chunks = classified_chunks.get("treatment_subset", [])
        symptom_chunks = classified_chunks.get("symptom_subset", [])      # Dataset B (future)
        diagnosis_chunks = classified_chunks.get("diagnosis_subset", [])  # Dataset B (future)

        # Select chunks based on intention or intelligent defaults
        selected_chunks = self._select_chunks_by_intention(
            intention=intention,
            emergency_chunks=emergency_chunks,
            treatment_chunks=treatment_chunks,
            symptom_chunks=symptom_chunks,
            diagnosis_chunks=diagnosis_chunks
        )

        # Build context block from selected chunks
        context_block = self._build_context_block(selected_chunks)

        # Construct medical RAG prompt
        prompt = self._construct_medical_prompt(user_query, context_block, intention)

        logger.info(f"Generated prompt with {len(selected_chunks)} chunks, {len(context_block)} chars")
        return prompt

    def _classify_retrieval_chunks(self, retrieval_results: Dict[str, Any]) -> Dict[str, List]:
        """
        Classify retrieval chunks by dataset source

        Args:
            retrieval_results: Results from BasicRetrievalSystem.search()

        Returns:
            Dict mapping dataset sources to chunk lists
        """
        classified = {
            "emergency_subset": [],
            "treatment_subset": [],
            "symptom_subset": [],    # Reserved for Dataset B
            "diagnosis_subset": []   # Reserved for Dataset B
        }

        # Process results from current dual-index system
        processed_results = retrieval_results.get('processed_results', [])

        for chunk in processed_results:
            chunk_type = chunk.get('type', 'unknown')

            # Map current system types to dataset sources
            if chunk_type == 'emergency':
                classified["emergency_subset"].append(chunk)
            elif chunk_type == 'treatment':
                classified["treatment_subset"].append(chunk)
            else:
                # Unknown type: classify by content analysis later; default to the emergency subset (tentative)
                logger.warning(f"Unknown chunk type: {chunk_type}, defaulting to emergency_subset (tentative)")
                classified["emergency_subset"].append(chunk)

        # TODO: Future integration point for Dataset B
        # When the Dataset B team provides symptom/diagnosis data:
        # classified["symptom_subset"] = process_dataset_b_symptoms(retrieval_results)
        # classified["diagnosis_subset"] = process_dataset_b_diagnosis(retrieval_results)

        logger.info(f"Classified chunks: Emergency={len(classified['emergency_subset'])}, "
                    f"Treatment={len(classified['treatment_subset'])}")

        return classified

    def _select_chunks_by_intention(self, intention: Optional[str],
                                    emergency_chunks: List, treatment_chunks: List,
                                    symptom_chunks: List, diagnosis_chunks: List) -> List:
        """
        Select optimal chunk combination based on query intention

        Args:
            intention: Detected or specified intention
            *_chunks: Chunks from different dataset sources

        Returns:
            List of selected chunks for prompt construction
        """
        if intention and intention in self.dataset_priorities:
            # Use predefined priorities for known intentions
            priorities = self.dataset_priorities[intention]
            selected_chunks = []

            # Add chunks according to priority allocation
            selected_chunks.extend(emergency_chunks[:priorities["emergency_subset"]])
            selected_chunks.extend(treatment_chunks[:priorities["treatment_subset"]])

            # TODO: Future Dataset B integration
            # selected_chunks.extend(symptom_chunks[:priorities["symptom_subset"]])
            # selected_chunks.extend(diagnosis_chunks[:priorities["diagnosis_subset"]])

            logger.info(f"Selected chunks by intention '{intention}': {len(selected_chunks)} total")

        else:
            # No specific intention - let the LLM judge from the best available chunks
            all_chunks = emergency_chunks + treatment_chunks + symptom_chunks + diagnosis_chunks

            # Sort by relevance (distance) and take top 6
            all_chunks_sorted = sorted(all_chunks, key=lambda x: x.get("distance", 999))
            selected_chunks = all_chunks_sorted[:6]

            logger.info(f"Selected chunks by relevance (no intention): {len(selected_chunks)} total")

        return selected_chunks

    def _build_context_block(self, selected_chunks: List) -> str:
        """
        Build formatted context block from selected chunks

        Args:
            selected_chunks: List of selected chunks

        Returns:
            Formatted context string for prompt
        """
        if not selected_chunks:
            return "No relevant medical guidelines found."

        context_parts = []

        for i, chunk in enumerate(selected_chunks, 1):
            chunk_text = chunk.get("text", "").strip()
            chunk_type = chunk.get("type", "unknown")
            distance = chunk.get("distance", 0)

            # Format each chunk with metadata
            context_part = f"""
[Guideline {i}] (Source: {chunk_type.title()}, Relevance: {1-distance:.3f})
{chunk_text}
""".strip()

            context_parts.append(context_part)

        return "\n\n".join(context_parts)

    def _construct_medical_prompt(self, user_query: str, context_block: str,
                                  intention: Optional[str]) -> str:
        """
        Construct final medical RAG prompt with appropriate framing

        Args:
            user_query: Original user query
            context_block: Formatted context from selected chunks
            intention: Query intention if detected

        Returns:
            Complete RAG prompt for Med42-70B
        """
        # Customize prompt based on intention
        if intention == "treatment":
            focus_guidance = "Focus on providing specific treatment protocols, management steps, and therapeutic interventions."
        elif intention == "diagnosis":
            focus_guidance = "Focus on differential diagnosis, diagnostic criteria, and assessment approaches."
        elif intention == "STAT(tentative)":
            focus_guidance = "Focus on immediate emergency interventions and critical decision-making steps."
        else:
            focus_guidance = "Provide comprehensive medical guidance covering both diagnostic and treatment aspects as appropriate."

        prompt = f"""You are an experienced attending physician providing guidance to a junior clinician in an emergency setting. A colleague is asking for your expert medical opinion.

Clinical Question:
{user_query}

Relevant Medical Guidelines:
{context_block}

Instructions:
{focus_guidance}

Please provide a clear, actionable response that:
1. Addresses the specific clinical question asked
2. References relevant evidence from the provided guidelines
3. Offers practical, step-by-step guidance when appropriate
4. Maintains appropriate medical caution and emphasizes the need for clinical judgment

Your response should be concise but comprehensive, suitable for immediate clinical application."""

        return prompt

    def _generate_with_med42(self, prompt: str) -> Dict[str, Any]:
        """
        Generate medical advice using Med42-70B

        Args:
            prompt: Complete RAG prompt

        Returns:
            Generation result with metadata
        """
        try:
            logger.info("Calling Med42-70B for medical advice generation")

            result = self.llm_client.analyze_medical_query(
                query=prompt,
                max_tokens=500,   # Adjust based on needs
                timeout=30.0      # Allow more time for complex medical advice
            )

            if result.get('error'):
                raise Exception(f"Med42-70B generation error: {result['error']}")

            return result

        except Exception as e:
            logger.error(f"Med42-70B generation failed: {e}")
            raise

    def _format_medical_response(self, user_query: str, generated_advice: Dict[str, Any],
                                 chunks_used: Dict[str, List], intention: Optional[str],
                                 processing_time: float) -> Dict[str, Any]:
        """
        Format final medical response with metadata and confidence assessment

        Args:
            user_query: Original query
            generated_advice: Result from Med42-70B
            chunks_used: Classification of chunks used
            intention: Detected intention
            processing_time: Total processing time

        Returns:
            Structured medical advice response
        """
        # Extract generated content
        advice_content = generated_advice.get('extracted_condition', '')
        if not advice_content:
            advice_content = generated_advice.get('raw_response', 'Unable to generate medical advice.')

        # Calculate confidence based on available factors
        confidence_score = self._calculate_confidence_score(generated_advice, chunks_used)

        # Count chunks used by source
        chunk_counts = {source: len(chunks) for source, chunks in chunks_used.items()}
        total_chunks = sum(chunk_counts.values())

        formatted_response = {
            "medical_advice": advice_content,
            "confidence_score": confidence_score,
            "query_metadata": {
                "original_query": user_query,
                "detected_intention": intention,
                "processing_time_seconds": processing_time,
                "total_chunks_used": total_chunks,
                "chunks_by_source": chunk_counts
            },
            "generation_metadata": {
                "model_used": "m42-health/Llama3-Med42-70B",
                "generation_time": generated_advice.get('latency', 0),
                "model_confidence": generated_advice.get('confidence', 'unknown'),
                "timestamp": datetime.now().isoformat()
            },
            "sources": {
                "emergency_sources": len(chunks_used.get("emergency_subset", [])),
                "treatment_sources": len(chunks_used.get("treatment_subset", [])),
                "total_sources": total_chunks
            },
            "disclaimer": "This advice is for informational purposes only and should not replace professional medical consultation. Always consult with qualified healthcare providers for medical decisions."
        }

        return formatted_response

    def _calculate_confidence_score(self, generated_advice: Dict[str, Any],
                                    chunks_used: Dict[str, List]) -> float:
        """
        Calculate confidence score based on generation quality and source reliability

        Args:
            generated_advice: Result from Med42-70B
            chunks_used: Chunks used in generation

        Returns:
            Confidence score between 0.0 and 1.0
        """
        confidence_factors = []

        # Factor 1: Model confidence if available
        model_confidence = generated_advice.get('confidence', '0.5')
        try:
            model_conf_value = float(model_confidence)
            confidence_factors.append(model_conf_value)
        except (ValueError, TypeError):
            confidence_factors.append(0.5)  # Default neutral confidence

        # Factor 2: Number of sources used (more sources = higher confidence)
        total_chunks = sum(len(chunks) for chunks in chunks_used.values())
        source_confidence = min(total_chunks / 6.0, 1.0)  # Normalize to max 6 chunks
        confidence_factors.append(source_confidence)

        # Factor 3: Response length (reasonable length indicates comprehensive advice)
        response_length = len(generated_advice.get('raw_response', ''))
        length_confidence = min(response_length / 500.0, 1.0)  # Normalize to ~500 chars
        confidence_factors.append(length_confidence)

        # Factor 4: Processing success (no errors = higher confidence)
        if generated_advice.get('error'):
            confidence_factors.append(0.3)  # Lower confidence if errors occurred
        else:
            confidence_factors.append(0.8)  # Higher confidence for clean generation

        # Calculate weighted average
        final_confidence = sum(confidence_factors) / len(confidence_factors)

        # Ensure confidence is within valid range
        return max(0.1, min(0.95, final_confidence))

    def _generate_error_response(self, user_query: str, error_message: str) -> Dict[str, Any]:
        """
        Generate error response when generation fails

        Args:
            user_query: Original query
            error_message: Error details

        Returns:
            Error response in standard format
        """
        return {
            "medical_advice": "I apologize, but I encountered an error while processing your medical query. Please try rephrasing your question or contact technical support if the issue persists.",
            "confidence_score": 0.0,
            "query_metadata": {
                "original_query": user_query,
                "detected_intention": None,
                "processing_time_seconds": 0.0,
                "total_chunks_used": 0,
                "chunks_by_source": {}
            },
            "generation_metadata": {
                "model_used": "m42-health/Llama3-Med42-70B",
                "error": error_message,
                "timestamp": datetime.now().isoformat()
            },
            "sources": {
                "emergency_sources": 0,
                "treatment_sources": 0,
                "total_sources": 0
            },
            "disclaimer": "This system experienced a technical error. Please consult with qualified healthcare providers for medical decisions."
        }


# Example usage and testing
def main():
    """
    Test the medical advice generation system
    """
    # Initialize generator
    generator = MedicalAdviceGenerator()

    # Example retrieval results (simulated)
    example_retrieval_results = {
        "processed_results": [
            {
                "type": "emergency",
                "distance": 0.3,
                "text": "Acute myocardial infarction requires immediate assessment including ECG, cardiac enzymes, and chest X-ray. Time-sensitive condition requiring rapid intervention.",
                "matched": "MI|chest pain"
            },
            {
                "type": "treatment",
                "distance": 0.25,
                "text": "Treatment protocol for STEMI includes aspirin 325mg, clopidogrel loading dose, and urgent PCI within 90 minutes when available.",
                "matched_treatment": "aspirin|PCI|thrombolytic"
            }
        ]
    }

    # Test queries
    test_queries = [
        ("How should I treat a patient with chest pain?", "treatment"),
        ("What are the signs of acute MI?", "diagnosis"),
        # ("Emergency management of cardiac arrest", "STAT(tentative)")
    ]

    for query, intention in test_queries:
        print(f"\n{'='*60}")
        print(f"Testing: {query}")
        print(f"Intention: {intention}")

        try:
            result = generator.generate_medical_advice(
                user_query=query,
                retrieval_results=example_retrieval_results,
                intention=intention
            )

            print(f"✅ Success: {result['confidence_score']:.2f} confidence")
            print(f"Advice: {result['medical_advice'][:200]}...")

        except Exception as e:
            print(f"❌ Error: {e}")

if __name__ == "__main__":
    main()
tests/result_of_test_end_to_end_pipeline.md
ADDED
The diff for this file is too large to render.
tests/test_end_to_end_pipeline.py
ADDED
@@ -0,0 +1,473 @@
#!/usr/bin/env python3
"""
End-to-End Pipeline Test Script for OnCall.ai

Tests the complete pipeline:
User Input → UserPrompt Processing → Retrieval → Generation → Structured Medical Advice

This script validates the entire workflow with realistic medical queries,
simulating the user confirmation process and generating final medical advice.

Author: OnCall.ai Team
Date: 2025-07-31
"""

import sys
import os
from pathlib import Path
import logging
import json
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional

# Add src directory to Python path
current_dir = Path(__file__).parent
project_root = current_dir.parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Import all pipeline modules
try:
    from user_prompt import UserPromptProcessor
    from retrieval import BasicRetrievalSystem
    from llm_clients import llm_Med42_70BClient
    from generation import MedicalAdviceGenerator
    from medical_conditions import CONDITION_KEYWORD_MAPPING
except ImportError as e:
    print(f"❌ Import Error: {e}")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python path: {sys.path}")
    sys.exit(1)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(project_root / 'tests' / 'end_to_end_pipeline.log')
    ]
)
logger = logging.getLogger(__name__)

class EndToEndPipelineTest:
    """Complete pipeline test with realistic medical scenarios"""

    def __init__(self):
        """Initialize test suite"""
        self.start_time = datetime.now()
        self.test_results = []
        self.components_initialized = False

        # Pipeline components
        self.llm_client = None
        self.retrieval_system = None
        self.user_prompt_processor = None
        self.medical_generator = None

    def initialize_complete_pipeline(self):
        """Initialize all pipeline components"""
        print("🔧 Initializing Complete OnCall.ai Pipeline...")
        print("-" * 60)

        try:
            # Initialize LLM client
            print("1. Initializing Med42-70B Client...")
            self.llm_client = llm_Med42_70BClient()
            print("   ✅ Med42-70B client ready")

            # Initialize retrieval system
            print("2. Initializing Dual-Index Retrieval System...")
            self.retrieval_system = BasicRetrievalSystem()
            print("   ✅ Emergency & Treatment indices loaded")

            # Initialize user prompt processor
            print("3. Initializing Multi-Level Prompt Processor...")
            self.user_prompt_processor = UserPromptProcessor(
                llm_client=self.llm_client,
                retrieval_system=self.retrieval_system
            )
            print("   ✅ Fallback validation system ready")

            # Initialize medical advice generator
            print("4. Initializing Medical Advice Generator...")
            self.medical_generator = MedicalAdviceGenerator(
                llm_client=self.llm_client
            )
            print("   ✅ RAG generation system ready")

            self.components_initialized = True
            print("\n🎉 Complete pipeline initialized successfully!")

        except Exception as e:
            logger.error(f"Pipeline initialization failed: {e}")
            print(f"❌ Initialization failed: {e}")
            traceback.print_exc()
            self.components_initialized = False

    def get_realistic_test_queries(self) -> List[Dict[str, Any]]:
        """Define realistic medical queries for end-to-end testing"""
        return [
            {
                "id": "e2e_001",
                "query": "How to treat acute myocardial infarction in emergency department?",
                "description": "Classic cardiac emergency with treatment focus",
                "expected_intention": "treatment",
                "category": "cardiac_emergency",
                "simulated_confirmation": "yes"
            },
            {
                "id": "e2e_002",
                "query": "Patient presenting with severe chest pain and shortness of breath",
                "description": "Symptom-based emergency requiring assessment and treatment",
                "expected_intention": "diagnosis",
                "category": "multi_symptom",
                "simulated_confirmation": "yes"
            },
            {
                "id": "e2e_003",
                "query": "What are the emergency protocols for acute stroke management?",
                "description": "Neurological emergency with protocol focus",
                "expected_intention": "treatment",
                "category": "neurological_emergency",
                "simulated_confirmation": "yes"
            },
            {
                "id": "e2e_004",
                "query": "Differential diagnosis for sudden onset chest pain in young adult",
                "description": "Diagnostic reasoning query",
                "expected_intention": "diagnosis",
                "category": "differential_diagnosis",
                "simulated_confirmation": "yes"
            },
            {
                "id": "e2e_005",
                "query": "Emergency management of pulmonary embolism",
                "description": "Pulmonary emergency requiring immediate intervention",
                "expected_intention": "treatment",
                "category": "pulmonary_emergency",
                "simulated_confirmation": "yes"
            },
            {
                "id": "e2e_006",
                "query": "How to cook pasta properly?",
                "description": "Non-medical query - should be rejected",
                "expected_intention": None,
                "category": "non_medical",
                "simulated_confirmation": "reject_expected"
            }
        ]

    def run_scripted_end_to_end_tests(self):
        """Execute complete end-to-end tests with realistic queries"""
        if not self.components_initialized:
            print("❌ Cannot run tests: pipeline not initialized")
            return

        test_queries = self.get_realistic_test_queries()

        print("\n🚀 Starting End-to-End Pipeline Tests")
        print(f"Total test scenarios: {len(test_queries)}")
        print(f"Test started at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 80)

        # Execute all tests
        for test_case in test_queries:
            result = self._execute_single_pipeline_test(test_case)
            self.test_results.append(result)

        # Generate comprehensive report
        self._generate_end_to_end_report()
        self._save_end_to_end_results()

    def _execute_single_pipeline_test(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
        """Execute a single test through the complete pipeline"""
        test_id = test_case["id"]
        query = test_case["query"]

        print(f"\n🧪 {test_id}: {test_case['description']}")
        print(f"Query: '{query}'")
        print(f"Expected: {test_case['expected_intention']} intention")
        print("-" * 70)

        pipeline_start = datetime.now()
        result = {
            "test_id": test_id,
            "test_case": test_case,
            "timestamp": datetime.now().isoformat(),
            "success": False,
            "error": None,
            "total_pipeline_time": 0,
            "pipeline_steps": {}
        }

        try:
            # STEP 1: User Prompt Processing
            print("   🎯 Step 1: Condition extraction and validation...")
            step1_start = datetime.now()

            condition_result = self.user_prompt_processor.extract_condition_keywords(query)
            step1_time = (datetime.now() - step1_start).total_seconds()

            result["pipeline_steps"]["condition_extraction"] = {
                "duration": step1_time,
                "result": condition_result,
                "condition_found": bool(condition_result.get('condition'))
            }

            print(f"      Condition: {condition_result.get('condition', 'None')}")
            print(f"      Keywords: Emergency='{condition_result.get('emergency_keywords', 'None')}', Treatment='{condition_result.get('treatment_keywords', 'None')}'")
            print(f"      Time: {step1_time:.3f}s")

            # Check if this is a non-medical query that should be rejected
            if condition_result.get('type') == 'invalid_query':
                print("      🚫 Non-medical query correctly rejected")
                result["pipeline_steps"]["rejection"] = {
                    "reason": "non_medical_query",
                    "message": condition_result.get('message', '')
                }
                result["success"] = test_case['category'] == 'non_medical'
                return result

            # STEP 2: User Confirmation (Simulated)
            print("   👤 Step 2: User confirmation (simulated as 'yes')...")
            confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)

            result["pipeline_steps"]["confirmation"] = {
                "type": confirmation.get('type', 'unknown'),
                "simulated_response": test_case['simulated_confirmation']
            }

            if not condition_result.get('condition'):
                print("      ⚠️ No condition extracted, skipping retrieval and generation")
                result["pipeline_steps"]["pipeline_stopped"] = "no_condition"
                return result

            # STEP 3: Retrieval
            print("   🔍 Step 3: Medical guideline retrieval...")
            step3_start = datetime.now()

            search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
            if not search_query:
                search_query = condition_result.get('condition', query)

            retrieval_results = self.retrieval_system.search(search_query, top_k=5)
            step3_time = (datetime.now() - step3_start).total_seconds()

            processed_results = retrieval_results.get('processed_results', [])
            emergency_count = len([r for r in processed_results if r.get('type') == 'emergency'])
            treatment_count = len([r for r in processed_results if r.get('type') == 'treatment'])

            result["pipeline_steps"]["retrieval"] = {
                "duration": step3_time,
                "search_query": search_query,
                "total_results": len(processed_results),
                "emergency_results": emergency_count,
                "treatment_results": treatment_count
            }

            print(f"      Search Query: '{search_query}'")
            print(f"      Results: {len(processed_results)} total ({emergency_count} emergency, {treatment_count} treatment)")
            print(f"      Time: {step3_time:.3f}s")

            # STEP 4: Medical Advice Generation
            print("   🧠 Step 4: Medical advice generation...")
            step4_start = datetime.now()

            # Determine intention (simulate intelligent detection)
            intention = test_case.get('expected_intention')

            medical_advice = self.medical_generator.generate_medical_advice(
                user_query=query,
                retrieval_results=retrieval_results,
                intention=intention
            )
            step4_time = (datetime.now() - step4_start).total_seconds()

            result["pipeline_steps"]["generation"] = {
                "duration": step4_time,
                "intention_used": intention,
                "confidence_score": medical_advice.get('confidence_score', 0.0),
                "advice_length": len(medical_advice.get('medical_advice', '')),
                "chunks_used": medical_advice.get('query_metadata', {}).get('total_chunks_used', 0)
            }

            print(f"      Intention: {intention}")
            print(f"      Confidence: {medical_advice.get('confidence_score', 0.0):.2f}")
            print(f"      Advice Length: {len(medical_advice.get('medical_advice', ''))} chars")
            print(f"      Chunks Used: {medical_advice.get('query_metadata', {}).get('total_chunks_used', 0)}")
            print(f"      Time: {step4_time:.3f}s")

            # STEP 5: Results Summary
            total_time = (datetime.now() - pipeline_start).total_seconds()
            result["total_pipeline_time"] = total_time
            result["final_medical_advice"] = medical_advice
            result["success"] = True

            print("\n   ✅ Pipeline completed successfully!")
            print(f"   📊 Total Time: {total_time:.3f}s")
            print("   🩺 Medical Advice Preview:")
            print(f"      {medical_advice.get('medical_advice', 'No advice generated')[:150]}...")

        except Exception as e:
            total_time = (datetime.now() - pipeline_start).total_seconds()
            result["total_pipeline_time"] = total_time
            result["error"] = str(e)
            result["traceback"] = traceback.format_exc()

            logger.error(f"Pipeline test {test_id} failed: {e}")
            print(f"   ❌ Pipeline failed: {e}")

        return result

    def _determine_extraction_source(self, condition_result: Dict) -> str:
        """Determine how the condition was extracted"""
        if condition_result.get('semantic_confidence') is not None:
            return "semantic_search"
        elif condition_result.get('generic_confidence') is not None:
            return "generic_search"
        elif condition_result.get('condition') in CONDITION_KEYWORD_MAPPING:
            return "predefined_mapping"
        else:
            return "llm_extraction"

    def _generate_end_to_end_report(self):
        """Generate comprehensive end-to-end test report"""
        end_time = datetime.now()
        total_duration = (end_time - self.start_time).total_seconds()

        successful_tests = [r for r in self.test_results if r['success']]
        failed_tests = [r for r in self.test_results if not r['success']]

        print("\n" + "=" * 80)
        print("📊 END-TO-END PIPELINE TEST REPORT")
        print("=" * 80)

        # Overall Statistics
        print("📋 Execution Summary:")
        print(f"   Test session duration: {total_duration:.3f}s")
        print(f"   Average per test: {total_duration/len(self.test_results):.3f}s")

        print("\n📈 Pipeline Results:")
        print(f"   Total tests: {len(self.test_results)}")
        print(f"   Successful: {len(successful_tests)} ✅")
        print(f"   Failed: {len(failed_tests)} ❌")
        print(f"   Success rate: {len(successful_tests)/len(self.test_results)*100:.1f}%")

        # Performance Analysis
        if successful_tests:
            print("\n⚡ Performance Analysis:")

            # Calculate average times for each step
            step_times = {}
            for result in successful_tests:
                for step_name, step_data in result.get('pipeline_steps', {}).items():
                    if 'duration' in step_data:
                        if step_name not in step_times:
                            step_times[step_name] = []
                        step_times[step_name].append(step_data['duration'])

            for step_name, times in step_times.items():
                avg_time = sum(times) / len(times)
                print(f"   {step_name.replace('_', ' ').title()}: {avg_time:.3f}s average")

            # Overall pipeline performance
            total_times = [r['total_pipeline_time'] for r in successful_tests]
            avg_total = sum(total_times) / len(total_times)
            print(f"   Complete Pipeline: {avg_total:.3f}s average")

        # Detailed Results
        print("\n📋 Detailed Test Results:")
        for result in self.test_results:
            test_case = result['test_case']
            status = "✅ PASS" if result['success'] else "❌ FAIL"

            print(f"\n   📝 {result['test_id']}: {status}")
            print(f"      Query: '{test_case['query']}'")
            print(f"      Category: {test_case['category']}")
            print(f"      Total Time: {result['total_pipeline_time']:.3f}s")

            if result['success']:
                steps = result.get('pipeline_steps', {})
                if 'condition_extraction' in steps:
                    condition = steps['condition_extraction']['result'].get('condition', 'None')
                    print(f"      Condition Extracted: {condition}")

                if 'generation' in steps:
                    confidence = steps['generation'].get('confidence_score', 0.0)
                    chunks = steps['generation'].get('chunks_used', 0)
                    print(f"      Generation: {confidence:.2f} confidence, {chunks} chunks")

                if 'final_medical_advice' in result:
                    advice = result['final_medical_advice'].get('medical_advice', '')
                    print(f"      Advice Preview: {advice[:100]}...")
            else:
                if result.get('error'):
                    print(f"      Error: {result['error']}")
                elif 'rejection' in result.get('pipeline_steps', {}):
                    print(f"      Rejected: {result['pipeline_steps']['rejection']['reason']}")

        print("\n" + "=" * 80)

    def _save_end_to_end_results(self):
        """Save detailed test results to JSON file"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = project_root / 'tests' / f'end_to_end_pipeline_results_{timestamp}.json'

        try:
            comprehensive_results = {
                "test_metadata": {
                    "test_type": "end_to_end_pipeline",
                    "timestamp": datetime.now().isoformat(),
                    "session_start": self.start_time.isoformat(),
                    "total_duration_seconds": (datetime.now() - self.start_time).total_seconds(),
                    "total_tests": len(self.test_results),
                    "successful_tests": len([r for r in self.test_results if r['success']]),
                    "failed_tests": len([r for r in self.test_results if not r['success']])
                },
                "pipeline_results": self.test_results,
                "component_status": {
                    "user_prompt_processor": "operational",
                    "retrieval_system": "operational",
                    "medical_generator": "operational",
                    "med42_llm_client": "operational"
                }
            }

            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)

            print(f"📄 End-to-end test results saved to: {filename}")

        except Exception as e:
            logger.error(f"Failed to save test results: {e}")
            print(f"⚠️ Failed to save test results: {e}")

def main():
    """Main execution function"""
    print("🏥 OnCall.ai Complete End-to-End Pipeline Test")
    print("Testing: User Input → UserPrompt → Retrieval → Generation")
    print("=" * 70)

    # Initialize test suite
    test_suite = EndToEndPipelineTest()

    # Initialize complete pipeline
    test_suite.initialize_complete_pipeline()

    if not test_suite.components_initialized:
        print("❌ Pipeline initialization failed. Cannot proceed with testing.")
        return 1

    # Run scripted end-to-end tests
    test_suite.run_scripted_end_to_end_tests()

    print("\n🎯 End-to-end testing completed!")
    print("Next step: Create Gradio interface for interactive testing")

    return 0

if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)