Spaces:
YanBoChen
committed on
Commit · 17613c8
1 Parent(s): 88e76fd
feat: Add Extraction, LLM Judge, and Relevance Chart Generators
- Implemented ExtractionChartGenerator to visualize extraction success rates from JSON statistics.
- Developed LLMJudgeEvaluator for batch evaluation of medical advice quality using Llama3-70B.
- Created RelevanceChartGenerator to generate charts for retrieval relevance metrics and cosine similarity analysis.
- Each generator includes methods for loading statistics, generating charts, and saving results; an illustrative sketch of the statistics layout they read follows this list.
- Added error handling and user-friendly messages for file operations and evaluations.
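
Each chart generator below reads the most recent statistics JSON from evaluation/results/, draws a 2x2 matplotlib figure, and writes it back to the same folder as a timestamped PNG. As an illustration only, here is a minimal sketch of the coverage statistics layout that coverage_chart_generator.py expects; the field names are taken from the reads in that file, while the values and the exact schema written by latency_evaluator.py are assumptions:

# Hypothetical example of evaluation/results/coverage_statistics_<timestamp>.json,
# shown as the dict returned by json.load(); the values are invented.
example_coverage_stats = {
    "category_results": {
        "diagnosis": {
            "average_coverage": 0.62,        # fraction, plotted as a percentage
            "min_coverage": 0.41,
            "max_coverage": 0.83,
            "successful_evaluations": 3,
            "total_queries": 3,
            "meets_threshold": True,         # drives the "Target Met" table column
            "individual_coverage_scores": [0.41, 0.62, 0.83]
        }
        # "treatment" and "mixed" follow the same shape
    },
    "overall_results": {
        "average_coverage": 0.62,
        "min_coverage": 0.41,
        "max_coverage": 0.83,
        "successful_queries": 3,
        "total_queries": 3,
        "meets_threshold": True
    }
}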
evaluation/coverage_chart_generator.py
ADDED
@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Coverage Chart Generator
===========================================

Generates retrieval coverage charts from saved statistics.
Shows how well generated advice utilizes retrieved content.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class CoverageChartGenerator:
    """Generate charts for retrieval coverage metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("📊 Initializing Coverage Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_coverage_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent coverage statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "coverage_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No coverage statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📁 Loading coverage statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_coverage_charts(self, stats: Dict[str, Any]) -> str:
        """Generate coverage analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Retrieval Coverage Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Average Coverage by Category
            ax1 = axes[0, 0]
            categories = []
            avg_coverages = []

            for category, cat_stats in category_results.items():
                if cat_stats['successful_evaluations'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_coverages.append(cat_stats['average_coverage'] * 100)  # Convert to percentage

            categories.append('Overall')
            avg_coverages.append(overall_results['average_coverage'] * 100)

            bars = ax1.bar(categories, avg_coverages, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Average Coverage Score by Category', fontweight='bold')
            ax1.set_ylabel('Coverage Score (%)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add target line
            ax1.axhline(y=60, color='red', linestyle='--', alpha=0.7, label='60% Target')
            ax1.legend()

            # Add value labels
            for bar, coverage in zip(bars, avg_coverages):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                         f'{coverage:.1f}%', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Coverage Distribution
            ax2 = axes[0, 1]

            # Collect all individual coverage scores
            all_scores = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_coverage_scores'):
                    all_scores.extend([score * 100 for score in cat_stats['individual_coverage_scores']])

            if all_scores:
                # Create histogram
                ax2.hist(all_scores, bins=15, alpha=0.7, color='lightcoral', edgecolor='black')
                ax2.axvline(x=60, color='red', linestyle='--', alpha=0.7, label='60% Target')
                ax2.axvline(x=np.mean(all_scores), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(all_scores):.1f}%')

                ax2.set_title('Coverage Score Distribution', fontweight='bold')
                ax2.set_xlabel('Coverage Score (%)')
                ax2.set_ylabel('Frequency')
                ax2.legend()
                ax2.grid(True, alpha=0.3)
            else:
                ax2.text(0.5, 0.5, 'No coverage data available', ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Coverage Score Distribution', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Avg Coverage', 'Min/Max', 'Success/Total', 'Target Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_queries'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['average_coverage']:.3f}",
                        f"{cat_stats['min_coverage']:.3f}/{cat_stats['max_coverage']:.3f}",
                        f"{cat_stats['successful_evaluations']}/{cat_stats['total_queries']}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['average_coverage']:.3f}",
                f"{overall_results['min_coverage']:.3f}/{overall_results['max_coverage']:.3f}",
                f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
                '✅' if overall_results.get('meets_threshold', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Coverage Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Coverage Performance Radar/Gauge
            ax4 = axes[1, 1]

            # Create gauge-like visualization for overall coverage
            overall_coverage_pct = overall_results['average_coverage'] * 100

            # Pie chart as gauge
            sizes = [overall_coverage_pct, 100 - overall_coverage_pct]
            colors = ['#2ca02c' if overall_coverage_pct >= 60 else '#ff7f0e', '#f0f0f0']

            wedges, texts, autotexts = ax4.pie(sizes, labels=['Covered', 'Not Covered'],
                                               autopct='%1.1f%%',
                                               colors=colors,
                                               startangle=90,
                                               counterclock=False)

            # Add center text
            ax4.text(0, 0, f'{overall_coverage_pct:.1f}%\nCoverage',
                     ha='center', va='center', fontsize=14, fontweight='bold')

            ax4.set_title(f'Overall Coverage Performance\n{"✅ Target Met" if overall_coverage_pct >= 60 else "❌ Below Target"}',
                          fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"coverage_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"📊 Coverage charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Coverage chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent coverage chart generation"""

    print("🚀 OnCall.ai Coverage Chart Generator")

    chart_gen = CoverageChartGenerator()

    try:
        stats = chart_gen.load_latest_coverage_statistics()
        chart_path = chart_gen.generate_coverage_charts(stats)

        print(f"\n✅ Coverage chart generation complete!")
        print(f"📊 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate coverage statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
evaluation/direct_llm_evaluator.py
ADDED
@@ -0,0 +1,401 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Direct LLM Evaluator (Med42-70B Only)
========================================================

Tests Med42-70B directly without RAG pipeline.
Only applicable metrics: 1 (Latency), 5 (Actionability), 6 (Evidence Quality)

Metrics 2-4 (Extraction, Relevance, Coverage) are not applicable for direct LLM.

Author: YanBo Chen
Date: 2025-08-04
"""

import time
import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import re

# Add project path
current_dir = Path(__file__).parent
project_root = current_dir.parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Import LLM client only (no retrieval system needed)
try:
    from llm_clients import llm_Med42_70BClient
except ImportError as e:
    print(f"❌ Import failed: {e}")
    print("Please ensure running from project root directory")
    sys.exit(1)


class DirectLLMEvaluator:
    """Direct LLM evaluation without RAG pipeline"""

    def __init__(self):
        """Initialize direct LLM client only"""
        print("🔧 Initializing Direct LLM Evaluator...")

        # Initialize only LLM client (no retrieval, no user_prompt processing)
        self.llm_client = llm_Med42_70BClient()

        # Results accumulation
        self.direct_results = []
        self.medical_outputs = []

        print("✅ Direct LLM Evaluator initialization complete")

    def evaluate_direct_llm_query(self, query: str, category: str = "unknown") -> Dict[str, Any]:
        """
        Direct LLM evaluation for single query

        Only tests direct LLM response without RAG pipeline
        Applicable metrics: 1 (Latency), 5-6 (via medical output)

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)
        """
        print(f"🔍 Direct LLM evaluation: {query[:50]}...")
        print(f"📋 Category: {category}")

        overall_start = time.time()

        try:
            # Direct LLM call without any RAG processing
            llm_start = time.time()

            # Create direct medical consultation prompt
            direct_prompt = f"""
You are a medical expert providing clinical guidance.

Patient Query: {query}

Please provide comprehensive medical advice including:
1. Differential diagnosis (if applicable)
2. Immediate assessment steps
3. Treatment recommendations
4. Clinical considerations

Provide evidence-based, actionable medical guidance.
"""

            # Direct LLM generation
            response = self.llm_client.generate_completion(direct_prompt)
            medical_advice = response.get('content', '') if isinstance(response, dict) else str(response)

            llm_time = time.time() - llm_start
            total_time = time.time() - overall_start

            # Create result
            result = {
                "query": query,
                "category": category,

                # Metric 1: Total Latency (direct LLM call time)
                "latency_metrics": {
                    "total_latency": total_time,
                    "llm_generation_time": llm_time,
                    "meets_target": total_time <= 30.0
                },

                # Metrics 2-4: Not applicable for direct LLM
                "extraction_metrics": {
                    "not_applicable": True,
                    "reason": "No extraction pipeline in direct LLM"
                },
                "relevance_metrics": {
                    "not_applicable": True,
                    "reason": "No retrieval pipeline in direct LLM"
                },
                "coverage_metrics": {
                    "not_applicable": True,
                    "reason": "No retrieval content to cover"
                },

                # Medical advice for metrics 5-6 evaluation
                "medical_advice": medical_advice,
                "advice_length": len(medical_advice),

                "overall_success": True,
                "model_type": "Med42-70B_direct",
                "timestamp": datetime.now().isoformat()
            }

            # Store result
            self.direct_results.append(result)

            # Store medical output for LLM judge evaluation
            medical_output = {
                "query": query,
                "category": category,
                "medical_advice": medical_advice,
                "query_id": f"{category}_query_direct",
                "model_type": "Med42-70B_direct",
                "processing_time": total_time,
                "timestamp": datetime.now().isoformat()
            }
            self.medical_outputs.append(medical_output)

            print(f"✅ Direct LLM completed in {total_time:.2f}s")
            print(f"📝 Generated advice: {len(medical_advice)} characters")

            return result

        except Exception as e:
            total_time = time.time() - overall_start
            print(f"❌ Direct LLM evaluation failed after {total_time:.2f}s: {e}")

            error_result = {
                "query": query,
                "category": category,
                "latency_metrics": {
                    "total_latency": total_time,
                    "meets_target": False
                },
                "overall_success": False,
                "error": str(e),
                "model_type": "Med42-70B_direct",
                "timestamp": datetime.now().isoformat()
            }

            self.direct_results.append(error_result)
            return error_result

    def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
        """Parse queries from file with category labels"""
        print(f"📁 Reading queries from file: {filepath}")

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            queries_by_category = {
                "diagnosis": [],
                "treatment": [],
                "mixed": []
            }

            lines = content.strip().split('\n')

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
                if match:
                    category_raw = match.group(1).lower()
                    query_text = match.group(2).strip()

                    if category_raw in ['mixed/complicated', 'mixed']:
                        category = 'mixed'
                    else:
                        category = category_raw

                    if category in queries_by_category and len(query_text) > 15:
                        queries_by_category[category].append({
                            "text": query_text,
                            "category": category
                        })

            print(f"📋 Parsed queries by category:")
            for category, category_queries in queries_by_category.items():
                print(f"   {category.capitalize()}: {len(category_queries)} queries")

            return queries_by_category

        except Exception as e:
            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

    def calculate_direct_llm_statistics(self) -> Dict[str, Any]:
        """Calculate statistics for direct LLM evaluation"""
        successful_results = [r for r in self.direct_results if r.get('overall_success')]

        if successful_results:
            latencies = [r['latency_metrics']['total_latency'] for r in successful_results]

            # Category-wise statistics
            category_stats = {}
            results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}

            for result in successful_results:
                category = result.get('category', 'unknown')
                if category in results_by_category:
                    results_by_category[category].append(result)

            for category, results in results_by_category.items():
                if results:
                    cat_latencies = [r['latency_metrics']['total_latency'] for r in results]
                    category_stats[category] = {
                        "average_latency": sum(cat_latencies) / len(cat_latencies),
                        "query_count": len(cat_latencies),
                        "target_compliance": sum(1 for lat in cat_latencies if lat <= 30.0) / len(cat_latencies)
                    }
                else:
                    category_stats[category] = {
                        "average_latency": 0.0,
                        "query_count": 0,
                        "target_compliance": 0.0
                    }

            # Overall statistics
            overall_stats = {
                "average_latency": sum(latencies) / len(latencies),
                "min_latency": min(latencies),
                "max_latency": max(latencies),
                "successful_queries": len(successful_results),
                "total_queries": len(self.direct_results),
                "success_rate": len(successful_results) / len(self.direct_results),
                "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies)
            }
        else:
            category_stats = {cat: {"average_latency": 0.0, "query_count": 0, "target_compliance": 0.0}
                              for cat in ["diagnosis", "treatment", "mixed"]}
            overall_stats = {
                "average_latency": 0.0,
                "successful_queries": 0,
                "total_queries": len(self.direct_results),
                "success_rate": 0.0,
                "target_compliance": 0.0
            }

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "model_type": "Med42-70B_direct",
            "timestamp": datetime.now().isoformat()
        }

    def save_direct_llm_statistics(self, filename: str = None) -> str:
        """Save direct LLM statistics"""
        stats = self.calculate_direct_llm_statistics()

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"direct_llm_statistics_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

        print(f"📊 Direct LLM statistics saved to: {filepath}")
        return str(filepath)

    def save_direct_medical_outputs(self, filename: str = None) -> str:
        """Save medical outputs for LLM judge evaluation"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"medical_outputs_direct_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        output_data = {
            "evaluation_metadata": {
                "total_outputs": len(self.medical_outputs),
                "categories": list(set(output['category'] for output in self.medical_outputs)),
                "timestamp": datetime.now().isoformat(),
                "model_type": "Med42-70B_direct"
            },
            "medical_outputs": self.medical_outputs
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"📁 Direct medical outputs saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent direct LLM evaluation interface"""

    print("🚀 OnCall.ai Direct LLM Evaluator - Med42-70B Only")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
    else:
        query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python direct_llm_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = DirectLLMEvaluator()

    # Parse queries
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test direct LLM for each query
    print(f"\n🧪 Direct LLM Testing (No RAG Pipeline)")

    for category, queries in queries_by_category.items():
        if not queries:
            continue

        print(f"\n📋 Testing {category.upper()} with direct Med42-70B:")

        for i, query_info in enumerate(queries):
            query_text = query_info['text']

            # Direct LLM evaluation
            result = evaluator.evaluate_direct_llm_query(query_text, category)

            # Pause between queries
            if i < len(queries) - 1:
                print(f"   ⏳ Pausing 5s before next query...")
                time.sleep(5)

        # Pause between categories
        if category != list(queries_by_category.keys())[-1]:
            print(f"\n⏳ Pausing 10s before next category...")
            time.sleep(10)

    # Save results
    print(f"\n📊 Generating direct LLM analysis...")

    stats_path = evaluator.save_direct_llm_statistics()
    outputs_path = evaluator.save_direct_medical_outputs()

    # Print summary
    stats = evaluator.calculate_direct_llm_statistics()
    overall_results = stats['overall_results']

    print(f"\n📊 === DIRECT LLM EVALUATION SUMMARY ===")
    print(f"Overall Performance:")
    print(f"   Average Latency: {overall_results['average_latency']:.2f}s")
    print(f"   Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
    print(f"   30s Target Compliance: {overall_results['target_compliance']:.1%}")

    print(f"\nApplicable Metrics:")
    print(f"   ✅ Metric 1 (Latency): Measured")
    print(f"   ❌ Metric 2 (Extraction): Not applicable - no extraction pipeline")
    print(f"   ❌ Metric 3 (Relevance): Not applicable - no retrieval pipeline")
    print(f"   ❌ Metric 4 (Coverage): Not applicable - no retrieval content")
    print(f"   📊 Metric 5 (Actionability): Requires LLM judge evaluation")
    print(f"   📊 Metric 6 (Evidence): Requires LLM judge evaluation")

    print(f"\n✅ Direct LLM evaluation complete!")
    print(f"📊 Statistics: {stats_path}")
    print(f"📁 Medical Outputs: {outputs_path}")
    print(f"\n💡 Next step: Run llm_judge_evaluator.py for metrics 5-6")
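
The parse_queries_from_file method above only keeps lines of the form <number>.<category>: <query text> with more than 15 characters of query text. A minimal standalone sketch of that input format using the same regex; the sample queries are invented for illustration:

import re

# Invented sample lines in the pre_user_query_evaluate.txt format parsed above
sample_lines = [
    "1.diagnosis: 65-year-old male with crushing chest pain radiating to the left arm for 30 minutes",
    "2.treatment: how should acute ischemic stroke be managed within the first hour of onset?",
    "3.mixed/complicated: pregnant patient with suspected pulmonary embolism and impaired renal function",
]

pattern = r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)'

for line in sample_lines:
    match = re.match(pattern, line, re.IGNORECASE)
    if match:
        # 'mixed/complicated' is folded into 'mixed', mirroring the evaluator
        category = match.group(1).lower()
        category = 'mixed' if category in ('mixed/complicated', 'mixed') else category
        print(f"{category}: {match.group(2).strip()[:40]}...")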
evaluation/extraction_chart_generator.py
ADDED
@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Extraction Chart Generator
============================================

Generates extraction success rate charts from saved statistics.
Reads JSON files produced by comprehensive evaluator.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class ExtractionChartGenerator:
    """Generate charts for condition extraction metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("📊 Initializing Extraction Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_extraction_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent extraction statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "extraction_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No extraction statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📁 Loading extraction statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_extraction_charts(self, stats: Dict[str, Any]) -> str:
        """Generate extraction success rate analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Extraction Success Rate Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Success Rate by Category
            ax1 = axes[0, 0]
            categories = []
            success_rates = []

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    success_rates.append(cat_stats['success_rate'] * 100)

            categories.append('Overall')
            success_rates.append(overall_results['success_rate'] * 100)

            bars = ax1.bar(categories, success_rates, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Extraction Success Rate by Category', fontweight='bold')
            ax1.set_ylabel('Success Rate (%)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add target line
            ax1.axhline(y=80, color='red', linestyle='--', alpha=0.7, label='80% Target')
            ax1.legend()

            # Add value labels
            for bar, rate in zip(bars, success_rates):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                         f'{rate:.1f}%', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Success Count
            ax2 = axes[0, 1]
            successful_counts = []
            total_counts = []

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    successful_counts.append(cat_stats['successful_count'])
                    total_counts.append(cat_stats['total_count'])

            successful_counts.append(overall_results['successful_count'])
            total_counts.append(overall_results['total_count'])

            x = np.arange(len(categories))
            width = 0.35

            ax2.bar(x - width/2, successful_counts, width, label='Successful', alpha=0.8)
            ax2.bar(x + width/2, total_counts, width, label='Total', alpha=0.8)

            ax2.set_title('Extraction Success Count', fontweight='bold')
            ax2.set_ylabel('Query Count')
            ax2.set_xlabel('Query Category')
            ax2.set_xticks(x)
            ax2.set_xticklabels(categories)
            ax2.legend()
            ax2.grid(True, alpha=0.3)

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Success Rate', 'Success/Total', 'Avg Time (s)', 'Target Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['success_rate']:.1%}",
                        f"{cat_stats['successful_count']}/{cat_stats['total_count']}",
                        f"{cat_stats['average_extraction_time']:.3f}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['success_rate']:.1%}",
                f"{overall_results['successful_count']}/{overall_results['total_count']}",
                '-',
                '✅' if overall_results.get('target_compliance', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Extraction Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Performance visualization
            ax4 = axes[1, 1]

            # Simple performance indicator
            overall_rate = overall_results['success_rate'] * 100
            colors = ['#d62728' if overall_rate < 80 else '#2ca02c']

            wedges, texts, autotexts = ax4.pie([overall_rate, 100 - overall_rate],
                                               labels=['Successful', 'Failed'],
                                               autopct='%1.1f%%',
                                               colors=['#2ca02c', '#ffcccc'],
                                               startangle=90)

            ax4.set_title(f'Overall Extraction Success\n{overall_rate:.1f}% Success Rate', fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"extraction_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"📊 Extraction charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Extraction chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent extraction chart generation"""

    print("🚀 OnCall.ai Extraction Chart Generator")

    chart_gen = ExtractionChartGenerator()

    try:
        stats = chart_gen.load_latest_extraction_statistics()
        chart_path = chart_gen.generate_extraction_charts(stats)

        print(f"\n✅ Extraction chart generation complete!")
        print(f"📊 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate extraction statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
evaluation/llm_judge_evaluator.py
ADDED
@@ -0,0 +1,401 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OnCall.ai System - LLM Judge Evaluator (Metrics 5-6)
|
| 4 |
+
====================================================
|
| 5 |
+
|
| 6 |
+
Uses Llama3-70B as third-party judge to evaluate medical advice quality.
|
| 7 |
+
Batch evaluation strategy: 1 call evaluates all queries for maximum efficiency.
|
| 8 |
+
|
| 9 |
+
Metrics evaluated:
|
| 10 |
+
5. Clinical Actionability (θ¨εΊε―ζδ½ζ§)
|
| 11 |
+
6. Clinical Evidence Quality (θ¨εΊθζεθ³ͺ)
|
| 12 |
+
|
| 13 |
+
Author: YanBo Chen
|
| 14 |
+
Date: 2025-08-04
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
from typing import Dict, List, Any, Tuple
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
import glob
|
| 24 |
+
import re
|
| 25 |
+
|
| 26 |
+
# Add project path
|
| 27 |
+
current_dir = Path(__file__).parent
|
| 28 |
+
project_root = current_dir.parent
|
| 29 |
+
src_dir = project_root / "src"
|
| 30 |
+
sys.path.insert(0, str(src_dir))
|
| 31 |
+
|
| 32 |
+
# Import LLM client for judge evaluation
|
| 33 |
+
try:
|
| 34 |
+
from llm_clients import llm_Med42_70BClient # Temporarily use Med42 as placeholder
|
| 35 |
+
# TODO: Replace with actual Llama3-70B client when available
|
| 36 |
+
except ImportError as e:
|
| 37 |
+
print(f"β Import failed: {e}")
|
| 38 |
+
print("Please ensure running from project root directory")
|
| 39 |
+
sys.exit(1)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class LLMJudgeEvaluator:
|
| 43 |
+
"""LLM judge evaluator using batch evaluation strategy"""
|
| 44 |
+
|
| 45 |
+
def __init__(self):
|
| 46 |
+
"""Initialize judge LLM client"""
|
| 47 |
+
print("π§ Initializing LLM Judge Evaluator...")
|
| 48 |
+
|
| 49 |
+
# TODO: Replace with actual Llama3-70B client
|
| 50 |
+
# For now, using Med42 as placeholder
|
| 51 |
+
self.judge_llm = llm_Med42_70BClient()
|
| 52 |
+
print("β οΈ Note: Using Med42 as placeholder for Llama3-70B judge")
|
| 53 |
+
|
| 54 |
+
self.evaluation_results = []
|
| 55 |
+
|
| 56 |
+
print("β
LLM Judge Evaluator initialization complete")
|
| 57 |
+
|
| 58 |
+
def load_medical_outputs(self, filepath: str) -> List[Dict[str, Any]]:
|
| 59 |
+
"""Load medical outputs from file"""
|
| 60 |
+
print(f"π Loading medical outputs from: {filepath}")
|
| 61 |
+
|
| 62 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 63 |
+
data = json.load(f)
|
| 64 |
+
|
| 65 |
+
medical_outputs = data.get('medical_outputs', [])
|
| 66 |
+
print(f"π Loaded {len(medical_outputs)} medical outputs")
|
| 67 |
+
|
| 68 |
+
return medical_outputs
|
| 69 |
+
|
| 70 |
+
def find_latest_medical_outputs(self, model_type: str = "rag") -> str:
|
| 71 |
+
"""Find the latest medical outputs file"""
|
| 72 |
+
results_dir = Path(__file__).parent / "results"
|
| 73 |
+
|
| 74 |
+
if model_type == "rag":
|
| 75 |
+
pattern = str(results_dir / "medical_outputs_*.json")
|
| 76 |
+
else: # direct
|
| 77 |
+
pattern = str(results_dir / "medical_outputs_direct_*.json")
|
| 78 |
+
|
| 79 |
+
output_files = glob.glob(pattern)
|
| 80 |
+
|
| 81 |
+
if not output_files:
|
| 82 |
+
raise FileNotFoundError(f"No medical outputs files found for {model_type} model")
|
| 83 |
+
|
| 84 |
+
latest_file = max(output_files, key=os.path.getmtime)
|
| 85 |
+
print(f"π Found latest medical outputs: {latest_file}")
|
| 86 |
+
|
| 87 |
+
return latest_file
|
| 88 |
+
|
| 89 |
+
def create_batch_evaluation_prompt(self, medical_outputs: List[Dict[str, Any]]) -> str:
|
| 90 |
+
"""
|
| 91 |
+
Create batch evaluation prompt for all queries at once
|
| 92 |
+
|
| 93 |
+
Maximum efficiency: 1 LLM call evaluates all queries
|
| 94 |
+
"""
|
| 95 |
+
prompt_parts = [
|
| 96 |
+
"You are a medical expert evaluating clinical advice quality.",
|
| 97 |
+
"Please evaluate each medical advice response on TWO criteria:",
|
| 98 |
+
"",
|
| 99 |
+
"CRITERIA:",
|
| 100 |
+
"1. Clinical Actionability (1-10): Can healthcare providers immediately act on this advice?",
|
| 101 |
+
"2. Clinical Evidence Quality (1-10): Is the advice evidence-based and follows medical standards?",
|
| 102 |
+
"",
|
| 103 |
+
"QUERIES TO EVALUATE:",
|
| 104 |
+
""
|
| 105 |
+
]
|
| 106 |
+
|
| 107 |
+
# Add each query and advice
|
| 108 |
+
for i, output in enumerate(medical_outputs, 1):
|
| 109 |
+
query = output.get('query', '')
|
| 110 |
+
advice = output.get('medical_advice', '')
|
| 111 |
+
category = output.get('category', 'unknown')
|
| 112 |
+
|
| 113 |
+
prompt_parts.extend([
|
| 114 |
+
f"=== QUERY {i} ({category.upper()}) ===",
|
| 115 |
+
f"Patient Query: {query}",
|
| 116 |
+
f"Medical Advice: {advice}",
|
| 117 |
+
""
|
| 118 |
+
])
|
| 119 |
+
|
| 120 |
+
prompt_parts.extend([
|
| 121 |
+
"RESPONSE FORMAT (provide exactly this format):",
|
| 122 |
+
""
|
| 123 |
+
])
|
| 124 |
+
|
| 125 |
+
# Add response format template
|
| 126 |
+
for i in range(1, len(medical_outputs) + 1):
|
| 127 |
+
prompt_parts.append(f"Query {i}: Actionability=X, Evidence=Y")
|
| 128 |
+
|
| 129 |
+
prompt_parts.extend([
|
| 130 |
+
"",
|
| 131 |
+
"Replace X and Y with numeric scores 1-10.",
|
| 132 |
+
"Provide only the scores in the exact format above."
|
| 133 |
+
])
|
| 134 |
+
|
| 135 |
+
return "\n".join(prompt_parts)
|
| 136 |
+
|
| 137 |
+
def parse_batch_evaluation_response(self, response: str, medical_outputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 138 |
+
"""Parse batch evaluation response into individual scores"""
|
| 139 |
+
results = []
|
| 140 |
+
|
| 141 |
+
# Parse response format: "Query 1: Actionability=8, Evidence=7"
|
| 142 |
+
lines = response.strip().split('\n')
|
| 143 |
+
|
| 144 |
+
for i, line in enumerate(lines):
|
| 145 |
+
line = line.strip()
|
| 146 |
+
if not line:
|
| 147 |
+
continue
|
| 148 |
+
|
| 149 |
+
# Try to match pattern: "Query X: Actionability=Y, Evidence=Z"
|
| 150 |
+
match = re.match(r'Query\s+(\d+):\s*Actionability\s*=\s*(\d+)\s*,\s*Evidence\s*=\s*(\d+)', line, re.IGNORECASE)
|
| 151 |
+
|
| 152 |
+
if match:
|
| 153 |
+
query_num = int(match.group(1)) - 1 # Convert to 0-based index
|
| 154 |
+
actionability_score = int(match.group(2))
|
| 155 |
+
evidence_score = int(match.group(3))
|
| 156 |
+
|
| 157 |
+
if query_num < len(medical_outputs):
|
| 158 |
+
output = medical_outputs[query_num]
|
| 159 |
+
|
| 160 |
+
result = {
|
| 161 |
+
"query": output.get('query', ''),
|
| 162 |
+
"category": output.get('category', 'unknown'),
|
| 163 |
+
"model_type": output.get('model_type', 'unknown'),
|
| 164 |
+
"medical_advice": output.get('medical_advice', ''),
|
| 165 |
+
|
| 166 |
+
# Metric 5: Clinical Actionability
|
| 167 |
+
"actionability_score": actionability_score / 10.0, # Normalize to 0-1
|
| 168 |
+
"actionability_raw": actionability_score,
|
| 169 |
+
|
| 170 |
+
# Metric 6: Clinical Evidence Quality
|
| 171 |
+
"evidence_score": evidence_score / 10.0, # Normalize to 0-1
|
| 172 |
+
"evidence_raw": evidence_score,
|
| 173 |
+
|
| 174 |
+
"evaluation_success": True,
|
| 175 |
+
"timestamp": datetime.now().isoformat()
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
results.append(result)
|
| 179 |
+
|
| 180 |
+
return results
|
| 181 |
+
|
| 182 |
+
def evaluate_batch_medical_outputs(self, medical_outputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 183 |
+
"""
|
| 184 |
+
Batch evaluate all medical outputs using single LLM call
|
| 185 |
+
|
| 186 |
+
Args:
|
| 187 |
+
medical_outputs: List of medical advice outputs to evaluate
|
| 188 |
+
"""
|
| 189 |
+
print(f"π§ Batch evaluating {len(medical_outputs)} medical outputs...")
|
| 190 |
+
|
| 191 |
+
try:
|
| 192 |
+
# Create batch evaluation prompt
|
| 193 |
+
batch_prompt = self.create_batch_evaluation_prompt(medical_outputs)
|
| 194 |
+
|
| 195 |
+
print(f"π Batch prompt created ({len(batch_prompt)} characters)")
|
| 196 |
+
print(f"π Calling judge LLM for batch evaluation...")
|
| 197 |
+
|
| 198 |
+
# Single LLM call for all evaluations
|
| 199 |
+
eval_start = time.time()
|
| 200 |
+
response = self.judge_llm.generate_completion(batch_prompt)
|
| 201 |
+
eval_time = time.time() - eval_start
|
| 202 |
+
|
| 203 |
+
# Extract response text
|
| 204 |
+
response_text = response.get('content', '') if isinstance(response, dict) else str(response)
|
| 205 |
+
|
| 206 |
+
print(f"β
Judge LLM completed batch evaluation in {eval_time:.2f}s")
|
| 207 |
+
print(f"π Response length: {len(response_text)} characters")
|
| 208 |
+
|
| 209 |
+
# Parse batch response
|
| 210 |
+
parsed_results = self.parse_batch_evaluation_response(response_text, medical_outputs)
|
| 211 |
+
|
| 212 |
+
if len(parsed_results) != len(medical_outputs):
|
| 213 |
+
print(f"β οΈ Warning: Expected {len(medical_outputs)} results, got {len(parsed_results)}")
|
| 214 |
+
|
| 215 |
+
self.evaluation_results.extend(parsed_results)
|
| 216 |
+
|
| 217 |
+
print(f"π Successfully parsed {len(parsed_results)} evaluation results")
|
| 218 |
+
|
| 219 |
+
return parsed_results
|
| 220 |
+
|
| 221 |
+
except Exception as e:
|
| 222 |
+
print(f"β Batch evaluation failed: {e}")
|
| 223 |
+
|
| 224 |
+
# Create error results for all outputs
|
| 225 |
+
error_results = []
|
| 226 |
+
for output in medical_outputs:
|
| 227 |
+
error_result = {
|
| 228 |
+
"query": output.get('query', ''),
|
| 229 |
+
"category": output.get('category', 'unknown'),
|
| 230 |
+
"model_type": output.get('model_type', 'unknown'),
|
| 231 |
+
"actionability_score": 0.0,
|
| 232 |
+
"evidence_score": 0.0,
|
| 233 |
+
"evaluation_success": False,
|
| 234 |
+
"error": str(e),
|
| 235 |
+
"timestamp": datetime.now().isoformat()
|
| 236 |
+
}
|
| 237 |
+
error_results.append(error_result)
|
| 238 |
+
|
| 239 |
+
self.evaluation_results.extend(error_results)
|
| 240 |
+
return error_results
|
| 241 |
+
|
| 242 |
+
def calculate_judge_statistics(self) -> Dict[str, Any]:
|
| 243 |
+
"""Calculate statistics for LLM judge evaluation"""
|
| 244 |
+
successful_results = [r for r in self.evaluation_results if r.get('evaluation_success')]
|
| 245 |
+
|
| 246 |
+
if not successful_results:
|
| 247 |
+
return {
|
| 248 |
+
"category_results": {},
|
| 249 |
+
"overall_results": {
|
| 250 |
+
"average_actionability": 0.0,
|
| 251 |
+
"average_evidence": 0.0,
|
| 252 |
+
"successful_evaluations": 0,
|
| 253 |
+
"total_queries": len(self.evaluation_results)
|
| 254 |
+
},
|
| 255 |
+
"timestamp": datetime.now().isoformat()
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
# Group by category
|
| 259 |
+
results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}
|
| 260 |
+
|
| 261 |
+
for result in successful_results:
|
| 262 |
+
category = result.get('category', 'unknown')
|
| 263 |
+
if category in results_by_category:
|
| 264 |
+
results_by_category[category].append(result)
|
| 265 |
+
|
| 266 |
+
# Calculate category statistics
|
| 267 |
+
category_stats = {}
|
| 268 |
+
for category, results in results_by_category.items():
|
| 269 |
+
if results:
|
| 270 |
+
                actionability_scores = [r['actionability_score'] for r in results]
                evidence_scores = [r['evidence_score'] for r in results]

                category_stats[category] = {
                    "average_actionability": sum(actionability_scores) / len(actionability_scores),
                    "average_evidence": sum(evidence_scores) / len(evidence_scores),
                    "query_count": len(results),
                    "actionability_target_met": (sum(actionability_scores) / len(actionability_scores)) >= 0.7,
                    "evidence_target_met": (sum(evidence_scores) / len(evidence_scores)) >= 0.75,
                    "individual_actionability_scores": actionability_scores,
                    "individual_evidence_scores": evidence_scores
                }
            else:
                category_stats[category] = {
                    "average_actionability": 0.0,
                    "average_evidence": 0.0,
                    "query_count": 0,
                    "actionability_target_met": False,
                    "evidence_target_met": False,
                    "individual_actionability_scores": [],
                    "individual_evidence_scores": []
                }

        # Calculate overall statistics
        all_actionability = [r['actionability_score'] for r in successful_results]
        all_evidence = [r['evidence_score'] for r in successful_results]

        overall_stats = {
            "average_actionability": sum(all_actionability) / len(all_actionability),
            "average_evidence": sum(all_evidence) / len(all_evidence),
            "successful_evaluations": len(successful_results),
            "total_queries": len(self.evaluation_results),
            "actionability_target_met": (sum(all_actionability) / len(all_actionability)) >= 0.7,
            "evidence_target_met": (sum(all_evidence) / len(all_evidence)) >= 0.75
        }

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "timestamp": datetime.now().isoformat()
        }

    def save_judge_statistics(self, model_type: str, filename: str = None) -> str:
        """Save judge evaluation statistics"""
        stats = self.calculate_judge_statistics()

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"judge_evaluation_{model_type}_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

        print(f"π Judge evaluation statistics saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent LLM judge evaluation interface"""

    print("π§ OnCall.ai LLM Judge Evaluator - Metrics 5-6 Batch Evaluation")

    if len(sys.argv) > 1 and sys.argv[1] in ['rag', 'direct']:
        model_type = sys.argv[1]
    else:
        print("Usage: python llm_judge_evaluator.py [rag|direct]")
        print("  rag    - Evaluate RAG system medical outputs")
        print("  direct - Evaluate direct LLM medical outputs")
        sys.exit(1)

    # Initialize evaluator
    evaluator = LLMJudgeEvaluator()

    try:
        # Find and load latest medical outputs
        outputs_file = evaluator.find_latest_medical_outputs(model_type)
        medical_outputs = evaluator.load_medical_outputs(outputs_file)

        if not medical_outputs:
            print(f"❌ No medical outputs found in {outputs_file}")
            sys.exit(1)

        # Batch evaluate all outputs
        print(f"\nπ§ͺ Batch LLM Judge Evaluation for {model_type.upper()} model")
        print(f"π Evaluating {len(medical_outputs)} medical advice outputs")
        print(f"π― Metrics: 5 (Actionability) + 6 (Evidence Quality)")
        print(f"β‘ Strategy: Single batch call for maximum efficiency")

        evaluation_results = evaluator.evaluate_batch_medical_outputs(medical_outputs)

        # Save results
        print(f"\nπ Generating judge evaluation analysis...")
        stats_path = evaluator.save_judge_statistics(model_type)

        # Print summary
        stats = evaluator.calculate_judge_statistics()
        overall_results = stats['overall_results']
        category_results = stats['category_results']

        print(f"\nπ === LLM JUDGE EVALUATION SUMMARY ({model_type.upper()}) ===")
        print(f"Overall Performance:")
        print(f"  Average Actionability: {overall_results['average_actionability']:.3f} ({overall_results['average_actionability']*10:.1f}/10)")
        print(f"  Average Evidence Quality: {overall_results['average_evidence']:.3f} ({overall_results['average_evidence']*10:.1f}/10)")
        print(f"  Actionability Target (≥7.0): {'✅ Met' if overall_results['actionability_target_met'] else '❌ Not Met'}")
        print(f"  Evidence Target (≥7.5): {'✅ Met' if overall_results['evidence_target_met'] else '❌ Not Met'}")

        print(f"\nCategory Breakdown:")
        for category, cat_stats in category_results.items():
            if cat_stats['query_count'] > 0:
                print(f"  {category.capitalize()}: "
                      f"Actionability={cat_stats['average_actionability']:.2f}, "
                      f"Evidence={cat_stats['average_evidence']:.2f} "
                      f"[{cat_stats['query_count']} queries]")

        print(f"\n✅ LLM judge evaluation complete!")
        print(f"π Statistics: {stats_path}")
        print(f"β‘ Efficiency: {len(medical_outputs)} evaluations in 1 LLM call")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print(f"π‘ Please run evaluator first:")
        if model_type == "rag":
            print("   python latency_evaluator.py pre_user_query_evaluate.txt")
        else:
            print("   python direct_llm_evaluator.py pre_user_query_evaluate.txt")
    except Exception as e:
        print(f"❌ Judge evaluation failed: {e}")
evaluation/relevance_chart_generator.py
ADDED
@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Relevance Chart Generator
============================================

Generates retrieval relevance charts from saved statistics.
Shows cosine similarity analysis and threshold compliance.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class RelevanceChartGenerator:
    """Generate charts for retrieval relevance metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("π Initializing Relevance Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_relevance_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent relevance statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "relevance_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No relevance statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"π Loading relevance statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_relevance_charts(self, stats: Dict[str, Any]) -> str:
        """Generate relevance analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Retrieval Relevance Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Average Relevance by Category
            ax1 = axes[0, 0]
            categories = []
            avg_relevances = []

            for category, cat_stats in category_results.items():
                if cat_stats['successful_retrievals'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_relevances.append(cat_stats['average_relevance'])

            categories.append('Overall')
            avg_relevances.append(overall_results['average_relevance'])

            bars = ax1.bar(categories, avg_relevances, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Average Relevance Score by Category', fontweight='bold')
            ax1.set_ylabel('Relevance Score (Cosine Similarity)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add threshold lines
            ax1.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
            ax1.axhline(y=0.25, color='red', linestyle='--', alpha=0.7, label='0.25 Target')
            ax1.legend()

            # Add value labels
            for bar, relevance in zip(bars, avg_relevances):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{relevance:.3f}', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Relevance Distribution
            ax2 = axes[0, 1]

            # Collect all individual relevance scores
            all_scores = []
            category_labels = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_relevance_scores'):
                    all_scores.extend(cat_stats['individual_relevance_scores'])
                    category_labels.extend([category] * len(cat_stats['individual_relevance_scores']))

            if all_scores:
                # Create histogram
                ax2.hist(all_scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
                ax2.axvline(x=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
                ax2.axvline(x=0.25, color='red', linestyle='--', alpha=0.7, label='0.25 Target')
                ax2.axvline(x=np.mean(all_scores), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(all_scores):.3f}')

                ax2.set_title('Relevance Score Distribution', fontweight='bold')
                ax2.set_xlabel('Relevance Score')
                ax2.set_ylabel('Frequency')
                ax2.legend()
                ax2.grid(True, alpha=0.3)
            else:
                ax2.text(0.5, 0.5, 'No relevance data available', ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Relevance Score Distribution', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Avg Relevance', 'Min/Max', 'Success/Total', 'Threshold Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_queries'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['average_relevance']:.3f}",
                        f"{cat_stats['min_relevance']:.3f}/{cat_stats['max_relevance']:.3f}",
                        f"{cat_stats['successful_retrievals']}/{cat_stats['total_queries']}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['average_relevance']:.3f}",
                f"{overall_results['min_relevance']:.3f}/{overall_results['max_relevance']:.3f}",
                f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
                '✅' if overall_results.get('target_compliance', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Relevance Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Category Comparison Box Plot
            ax4 = axes[1, 1]

            box_data = []
            box_labels = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_relevance_scores'):
                    box_data.append(cat_stats['individual_relevance_scores'])
                    box_labels.append(category.replace('_', ' ').title())

            if box_data:
                box_plot = ax4.boxplot(box_data, labels=box_labels, patch_artist=True)
                colors = ['#1f77b4', '#ff7f0e', '#d62728']
                for patch, color in zip(box_plot['boxes'], colors[:len(box_plot['boxes'])]):
                    patch.set_facecolor(color)
                    patch.set_alpha(0.7)

                ax4.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
                ax4.axhline(y=0.25, color='red', linestyle='--', alpha=0.7, label='0.25 Target')
                ax4.set_title('Relevance Distribution by Category', fontweight='bold')
                ax4.set_ylabel('Relevance Score')
                ax4.legend()
                ax4.grid(True, alpha=0.3)
            else:
                ax4.text(0.5, 0.5, 'Insufficient data for box plot', ha='center', va='center', transform=ax4.transAxes)
                ax4.set_title('Relevance Distribution by Category', fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"relevance_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"π Relevance charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Relevance chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent relevance chart generation"""

    print("π OnCall.ai Relevance Chart Generator")

    chart_gen = RelevanceChartGenerator()

    try:
        stats = chart_gen.load_latest_relevance_statistics()
        chart_path = chart_gen.generate_relevance_charts(stats)

        print(f"\n✅ Relevance chart generation complete!")
        print(f"π Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("π‘ Please run latency_evaluator.py first to generate relevance statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
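For reference, a minimal sketch of the statistics structure that generate_relevance_charts() consumes, inferred from the field accesses in the code above. It is not part of the committed file; the category label and all numbers are illustrative placeholders, and it assumes relevance_chart_generator.py is importable from the evaluation directory.

# Sketch (not in the commit): example input for RelevanceChartGenerator.generate_relevance_charts().
from relevance_chart_generator import RelevanceChartGenerator

example_stats = {
    "category_results": {
        "diagnosis": {                               # hypothetical category label
            "average_relevance": 0.31,
            "min_relevance": 0.22,
            "max_relevance": 0.41,
            "successful_retrievals": 3,
            "total_queries": 3,
            "individual_relevance_scores": [0.22, 0.30, 0.41],
            "meets_threshold": True,
        }
    },
    "overall_results": {
        "average_relevance": 0.31,
        "min_relevance": 0.22,
        "max_relevance": 0.41,
        "successful_queries": 3,
        "total_queries": 3,
        "target_compliance": True,
    },
}

chart_gen = RelevanceChartGenerator()
chart_gen.generate_relevance_charts(example_stats)   # writes relevance_analysis_charts_<timestamp>.png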