| """ | |
| Gap Analyzer Module | |
| Analyzes model performance to identify knowledge gaps and weak areas. | |
| """ | |
| from typing import List, Dict, Optional, Any, Tuple | |
| import json | |
| from pathlib import Path | |
| from collections import defaultdict | |
| import statistics | |


class GapAnalyzer:
    """
    Analyzes evaluation results to identify knowledge gaps.

    Features:
    - Topic-level performance analysis
    - Trend tracking across evaluations
    - Weakness identification
    - Strength identification
    - Improvement recommendations
    """

    def __init__(self):
        """Initialize gap analyzer."""
        self.evaluation_history: List[Dict[str, Any]] = []
        self.performance_by_category: Dict[str, List[float]] = defaultdict(list)
        self.gaps: List[Dict[str, Any]] = []

    def add_evaluation_results(self, results: Dict[str, Any]):
        """
        Add evaluation results for analysis.

        Args:
            results: Evaluation results dictionary
        """
        self.evaluation_history.append(results)

        # Extract per-category performance if example-level data is available
        if 'examples' in results:
            category_scores = defaultdict(list)
            for example in results['examples']:
                category = example.get('category', 'general')
                prediction = example.get('prediction', '').lower()
                reference = example.get('reference', '').lower()
                # Binary scoring: 1 if word overlap exceeds 0.5, else 0
                score = 1.0 if self._calculate_similarity(prediction, reference) > 0.5 else 0.0
                category_scores[category].append(score)

            # Store average score (as a percentage) for each category
            for category, scores in category_scores.items():
                avg_score = (sum(scores) / len(scores)) * 100 if scores else 0
                self.performance_by_category[category].append(avg_score)
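
    # Expected shape of the `results` dict, inferred from the keys this class
    # reads (illustrative only; 'metrics' is consumed by compare_evaluations):
    #
    #   {
    #       "metrics": {"accuracy": 72.5},
    #       "examples": [
    #           {"category": "math", "prediction": "4", "reference": "4"},
    #           {"category": "history", "prediction": "1492", "reference": "1776"},
    #       ],
    #   }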

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Calculate Jaccard similarity between two texts (word-set overlap)."""
        words1 = set(text1.split())
        words2 = set(text2.split())
        if not words1 or not words2:
            return 0.0
        intersection = words1 & words2
        union = words1 | words2
        return len(intersection) / len(union) if union else 0.0
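
    # Worked example of the Jaccard score above (hypothetical strings):
    #   text1 = "the cat sat"  -> {"the", "cat", "sat"}
    #   text2 = "the cat ran"  -> {"the", "cat", "ran"}
    #   intersection = 2 words, union = 4 words
    #   similarity = 2 / 4 = 0.5, which does NOT pass the strict > 0.5 threshold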

    def analyze_gaps(
        self,
        weak_threshold: float = 60.0,
        strong_threshold: float = 85.0
    ) -> List[Dict[str, Any]]:
        """
        Analyze performance and identify gaps.

        Args:
            weak_threshold: Score below this is considered weak
            strong_threshold: Score above this is considered strong

        Returns:
            List of identified gaps with details
        """
        gaps = []

        # Analyze each category
        for category, scores in self.performance_by_category.items():
            if not scores:
                continue
            avg_score = statistics.mean(scores)
            latest_score = scores[-1]

            # Calculate trend: compare the most recent scores against older ones
            trend = "stable"
            if len(scores) >= 2:
                recent_avg = statistics.mean(scores[-3:]) if len(scores) >= 3 else statistics.mean(scores[-2:])
                older = scores[:-3]
                # Guard against an empty slice (e.g. exactly 3 scores), which
                # would make statistics.mean() raise StatisticsError
                older_avg = statistics.mean(older) if older else scores[0]
                if recent_avg > older_avg + 5:
                    trend = "improving"
                elif recent_avg < older_avg - 5:
                    trend = "declining"

            # Classify performance level
            if avg_score < weak_threshold:
                level = "WEAK"
                priority = "HIGH"
            elif avg_score < strong_threshold:
                level = "MODERATE"
                priority = "MEDIUM"
            else:
                level = "STRONG"
                priority = "LOW"

            gap = {
                'category': category,
                'avg_score': avg_score,
                'latest_score': latest_score,
                'num_evaluations': len(scores),
                'trend': trend,
                'level': level,
                'priority': priority,
                'scores_history': scores
            }
            gaps.append(gap)

        # Sort by priority (weak areas first), then by ascending score
        priority_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}
        gaps.sort(key=lambda x: (priority_order.get(x['priority'], 3), x['avg_score']))

        self.gaps = gaps
        return gaps
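
    # With the default thresholds (60 / 85), hypothetical average scores
    # classify as follows:
    #   avg 45.0 -> WEAK,     priority HIGH
    #   avg 72.0 -> MODERATE, priority MEDIUM
    #   avg 91.0 -> STRONG,   priority LOW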

    def get_weakest_topics(self, n: int = 5) -> List[Dict[str, Any]]:
        """
        Get the N weakest topics.

        Args:
            n: Number of topics to return

        Returns:
            List of the weakest topics, worst first
        """
        if not self.gaps:
            self.analyze_gaps()
        # self.gaps is already sorted weakest-first by analyze_gaps()
        weak_gaps = [g for g in self.gaps if g['level'] in ['WEAK', 'MODERATE']]
        return weak_gaps[:n]

    def get_strongest_topics(self, n: int = 5) -> List[Dict[str, Any]]:
        """
        Get the N strongest topics.

        Args:
            n: Number of topics to return

        Returns:
            List of the strongest topics, best first
        """
        if not self.gaps:
            self.analyze_gaps()
        strong_gaps = [g for g in self.gaps if g['level'] == 'STRONG']
        # self.gaps is sorted ascending by score, so re-sort best-first here
        strong_gaps.sort(key=lambda g: g['avg_score'], reverse=True)
        return strong_gaps[:n]

    def get_declining_topics(self) -> List[Dict[str, Any]]:
        """Get topics with declining performance."""
        if not self.gaps:
            self.analyze_gaps()
        return [g for g in self.gaps if g['trend'] == 'declining']

    def get_improving_topics(self) -> List[Dict[str, Any]]:
        """Get topics with improving performance."""
        if not self.gaps:
            self.analyze_gaps()
        return [g for g in self.gaps if g['trend'] == 'improving']

    def generate_gap_report(self) -> str:
        """
        Generate a human-readable gap analysis report.

        Returns:
            Formatted report string
        """
        if not self.gaps:
            self.analyze_gaps()

        report = ["=" * 80]
        report.append("KNOWLEDGE GAP ANALYSIS REPORT")
        report.append("=" * 80)
        report.append("")

        # Overall summary
        weak_count = sum(1 for g in self.gaps if g['level'] == 'WEAK')
        moderate_count = sum(1 for g in self.gaps if g['level'] == 'MODERATE')
        strong_count = sum(1 for g in self.gaps if g['level'] == 'STRONG')
        report.append(f"Total Categories Analyzed: {len(self.gaps)}")
        report.append(f"  - WEAK (needs immediate attention): {weak_count}")
        report.append(f"  - MODERATE (needs improvement): {moderate_count}")
        report.append(f"  - STRONG (performing well): {strong_count}")
        report.append("")

        # Weak areas (priority)
        weak_topics = [g for g in self.gaps if g['level'] == 'WEAK']
        if weak_topics:
            report.append("🔴 WEAK AREAS (Priority Training Needed):")
            report.append("-" * 80)
            for gap in weak_topics:
                report.append(f"  • {gap['category']}: {gap['avg_score']:.1f}% (Trend: {gap['trend']})")
            report.append("")

        # Moderate areas
        moderate_topics = [g for g in self.gaps if g['level'] == 'MODERATE']
        if moderate_topics:
            report.append("🟡 MODERATE AREAS (Recommended Improvement):")
            report.append("-" * 80)
            for gap in moderate_topics[:5]:  # Top 5
                report.append(f"  • {gap['category']}: {gap['avg_score']:.1f}% (Trend: {gap['trend']})")
            report.append("")

        # Strong areas
        strong_topics = [g for g in self.gaps if g['level'] == 'STRONG']
        if strong_topics:
            report.append("🟢 STRONG AREAS (Excellent Performance):")
            report.append("-" * 80)
            for gap in strong_topics[:5]:  # Top 5
                report.append(f"  • {gap['category']}: {gap['avg_score']:.1f}% (Trend: {gap['trend']})")
            report.append("")

        # Trends
        declining = self.get_declining_topics()
        improving = self.get_improving_topics()
        if declining:
            report.append("📉 DECLINING PERFORMANCE (Needs Attention):")
            report.append("-" * 80)
            for gap in declining:
                report.append(f"  • {gap['category']}: {gap['avg_score']:.1f}%")
            report.append("")
        if improving:
            report.append("📈 IMPROVING PERFORMANCE (Keep It Up!):")
            report.append("-" * 80)
            for gap in improving:
                report.append(f"  • {gap['category']}: {gap['avg_score']:.1f}%")
            report.append("")

        report.append("=" * 80)
        return "\n".join(report)

    def get_performance_summary(self) -> Dict[str, Any]:
        """
        Get overall performance summary.

        Returns:
            Summary statistics
        """
        if not self.gaps:
            self.analyze_gaps()

        all_scores = [g['avg_score'] for g in self.gaps]
        summary = {
            'num_categories': len(self.gaps),
            'overall_avg_score': statistics.mean(all_scores) if all_scores else 0,
            'min_score': min(all_scores) if all_scores else 0,
            'max_score': max(all_scores) if all_scores else 0,
            'weak_count': sum(1 for g in self.gaps if g['level'] == 'WEAK'),
            'moderate_count': sum(1 for g in self.gaps if g['level'] == 'MODERATE'),
            'strong_count': sum(1 for g in self.gaps if g['level'] == 'STRONG'),
            'declining_count': sum(1 for g in self.gaps if g['trend'] == 'declining'),
            'improving_count': sum(1 for g in self.gaps if g['trend'] == 'improving')
        }
        return summary

    def export_gaps(self, filepath: str):
        """
        Export gap analysis to JSON file.

        Args:
            filepath: Output file path
        """
        if not self.gaps:
            self.analyze_gaps()
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        data = {
            'summary': self.get_performance_summary(),
            'gaps': self.gaps,
            'report': self.generate_gap_report()
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Gap analysis exported to: {filepath}")

    def load_gaps(self, filepath: str):
        """
        Load gap analysis from JSON file.

        Args:
            filepath: Input file path
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.gaps = data.get('gaps', [])
        # Reconstruct performance_by_category from each gap's score history
        for gap in self.gaps:
            category = gap['category']
            scores = gap.get('scores_history', [])
            self.performance_by_category[category] = scores

    def compare_evaluations(
        self,
        eval1: Dict[str, Any],
        eval2: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Compare two evaluation results.

        Args:
            eval1: First evaluation results
            eval2: Second evaluation results

        Returns:
            Comparison details
        """
        comparison = {
            'improvement': {},
            'decline': {},
            'stable': {}
        }

        # Extract metrics from both
        metrics1 = eval1.get('metrics', {})
        metrics2 = eval2.get('metrics', {})

        # Compare each metric present in both; changes within ±1 point count as stable
        for metric in set(metrics1.keys()) | set(metrics2.keys()):
            if metric in metrics1 and metric in metrics2:
                val1 = metrics1[metric]
                val2 = metrics2[metric]
                if isinstance(val1, (int, float)) and isinstance(val2, (int, float)):
                    diff = val2 - val1
                    percent_change = (diff / val1 * 100) if val1 != 0 else 0
                    if diff > 1:  # Improved
                        comparison['improvement'][metric] = {
                            'old': val1,
                            'new': val2,
                            'change': diff,
                            'percent_change': percent_change
                        }
                    elif diff < -1:  # Declined
                        comparison['decline'][metric] = {
                            'old': val1,
                            'new': val2,
                            'change': diff,
                            'percent_change': percent_change
                        }
                    else:  # Stable
                        comparison['stable'][metric] = {
                            'old': val1,
                            'new': val2,
                            'change': diff
                        }
        return comparison
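
    # Worked example with hypothetical metric values:
    #   eval1 = {"metrics": {"accuracy": 70.0}}
    #   eval2 = {"metrics": {"accuracy": 75.0}}
    #   -> diff = 5.0 (> 1), percent_change = 5.0 / 70.0 * 100 ≈ 7.14%
    #   -> filed under comparison["improvement"]["accuracy"]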

    def get_category_details(self, category: str) -> Optional[Dict[str, Any]]:
        """
        Get detailed analysis for a specific category.

        Args:
            category: Category name

        Returns:
            Category details or None if not found
        """
        if not self.gaps:
            self.analyze_gaps()
        for gap in self.gaps:
            if gap['category'] == category:
                return gap
        return None
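

# Minimal end-to-end usage sketch with hypothetical data. The payload shape
# follows the keys add_evaluation_results() reads; the category labels and
# example strings below are illustrative assumptions, not real results.
if __name__ == "__main__":
    analyzer = GapAnalyzer()
    # Two evaluation rounds so trend tracking has some history to work with
    analyzer.add_evaluation_results({
        "metrics": {"accuracy": 60.0},
        "examples": [
            {"category": "math", "prediction": "the answer is 4", "reference": "the answer is 4"},
            {"category": "history", "prediction": "in 1492", "reference": "in 1776"},
        ],
    })
    analyzer.add_evaluation_results({
        "metrics": {"accuracy": 75.0},
        "examples": [
            {"category": "math", "prediction": "x equals 9", "reference": "x equals 9"},
            {"category": "history", "prediction": "in 1776", "reference": "in 1776"},
        ],
    })
    analyzer.analyze_gaps()
    print(analyzer.generate_gap_report())
    print(analyzer.get_performance_summary())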