| """ | |
| Benchmark Module | |
| Provides benchmark creation and execution for model testing. | |
| """ | |
| from dataclasses import dataclass, field | |
| from typing import List, Dict, Optional, Any | |
| import json | |
| import time | |
| from pathlib import Path | |


@dataclass
class Benchmark:
    """
    A single benchmark test.

    Attributes:
        name: Benchmark name
        description: Benchmark description
        questions: List of test questions
        metadata: Additional metadata
    """
    name: str
    description: str = ""
    questions: List[Dict[str, Any]] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)
    created_at: Optional[str] = None
    domain: str = "general"
    difficulty: str = "mixed"
    passing_score: float = 70.0
    def __post_init__(self):
        """Initialize timestamp if not provided."""
        if self.created_at is None:
            self.created_at = time.strftime('%Y-%m-%d %H:%M:%S')

    def add_question(
        self,
        question: str,
        answer: str,
        category: Optional[str] = None,
        difficulty: Optional[str] = None,
        metadata: Optional[Dict] = None
    ):
        """
        Add a question to the benchmark.

        Args:
            question: Question text
            answer: Expected answer
            category: Question category/topic
            difficulty: Difficulty level
            metadata: Additional metadata
        """
        question_dict = {
            'question': question,
            'answer': answer,
            'category': category or 'general',
            'difficulty': difficulty or 'intermediate',
            'metadata': metadata or {}
        }
        self.questions.append(question_dict)
    def get_questions_by_category(self, category: str) -> List[Dict]:
        """Get all questions in a category."""
        return [q for q in self.questions if q.get('category') == category]

    def get_questions_by_difficulty(self, difficulty: str) -> List[Dict]:
        """Get all questions of a difficulty level."""
        return [q for q in self.questions if q.get('difficulty') == difficulty]

    def to_dict(self) -> Dict[str, Any]:
        """Convert benchmark to dictionary."""
        return {
            'name': self.name,
            'description': self.description,
            'domain': self.domain,
            'difficulty': self.difficulty,
            'passing_score': self.passing_score,
            'created_at': self.created_at,
            'num_questions': len(self.questions),
            'questions': self.questions,
            'metadata': self.metadata
        }
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Benchmark':
        """Create benchmark from dictionary."""
        return cls(
            name=data.get('name', 'Untitled'),
            description=data.get('description', ''),
            questions=data.get('questions', []),
            metadata=data.get('metadata', {}),
            created_at=data.get('created_at'),
            domain=data.get('domain', 'general'),
            difficulty=data.get('difficulty', 'mixed'),
            passing_score=data.get('passing_score', 70.0)
        )
    def save(self, filepath: str):
        """Save benchmark to JSON file."""
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
        print(f"Benchmark saved to: {filepath}")
    @classmethod
    def load(cls, filepath: str) -> 'Benchmark':
        """Load benchmark from JSON file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.from_dict(data)
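

# Usage sketch: a small, purely illustrative helper showing the typical
# Benchmark lifecycle (build -> save -> load). The helper name and the default
# path are examples only, not part of the module's documented API.
def _example_benchmark_roundtrip(path: str = "benchmarks/example.json") -> Benchmark:
    """Illustrative only: build a tiny Benchmark, save it to JSON, and load it back."""
    bench = Benchmark(name="example", domain="general")
    bench.add_question("What is 2 + 2?", "4", category="arithmetic", difficulty="beginner")
    bench.save(path)
    return Benchmark.load(path)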


class BenchmarkSuite:
    """
    Collection of benchmarks for comprehensive testing.

    Features:
        - Multiple benchmark management
        - Batch execution
        - Aggregate scoring
        - Result tracking
    """

    def __init__(self, name: str = "Default Suite"):
        """
        Initialize benchmark suite.

        Args:
            name: Suite name
        """
        self.name = name
        self.benchmarks: List[Benchmark] = []
        self.results: List[Dict[str, Any]] = []
    def add_benchmark(self, benchmark: Benchmark):
        """
        Add a benchmark to the suite.

        Args:
            benchmark: Benchmark to add
        """
        self.benchmarks.append(benchmark)
        print(f"Added benchmark: {benchmark.name}")

    def remove_benchmark(self, benchmark_name: str):
        """
        Remove a benchmark by name.

        Args:
            benchmark_name: Name of benchmark to remove
        """
        self.benchmarks = [b for b in self.benchmarks if b.name != benchmark_name]

    def get_benchmark(self, name: str) -> Optional[Benchmark]:
        """
        Get a benchmark by name.

        Args:
            name: Benchmark name

        Returns:
            Benchmark if found, None otherwise
        """
        for benchmark in self.benchmarks:
            if benchmark.name == name:
                return benchmark
        return None
    def run_benchmark(
        self,
        benchmark: Benchmark,
        model_evaluator: Any,
        max_questions: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Run a single benchmark.

        Args:
            benchmark: Benchmark to run
            model_evaluator: Model evaluator instance
            max_questions: Maximum questions to test

        Returns:
            Benchmark results
        """
        print(f"\nRunning benchmark: {benchmark.name}")
        print(f"Total questions: {len(benchmark.questions)}")

        questions = benchmark.questions[:max_questions] if max_questions else benchmark.questions

        # Convert to dataset format
        dataset = []
        for q in questions:
            dataset.append({
                'instruction': q['question'],
                'input': '',
                'output': q['answer']
            })

        # Run evaluation
        start_time = time.time()
        eval_results = model_evaluator.evaluate_dataset(dataset)
        total_time = time.time() - start_time

        # Calculate score
        score = self._calculate_score(eval_results)

        # Compile results
        results = {
            'benchmark_name': benchmark.name,
            'num_questions': len(questions),
            'score': score,
            'passed': score >= benchmark.passing_score,
            'passing_score': benchmark.passing_score,
            'total_time': total_time,
            'evaluation_results': eval_results,
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
        }
        self.results.append(results)

        print(f"\n{'='*60}")
        print(f"Benchmark: {benchmark.name}")
        print(f"Score: {score:.2f}% (Passing: {benchmark.passing_score}%)")
        print(f"Status: {'✓ PASSED' if results['passed'] else '✗ FAILED'}")
        print(f"{'='*60}\n")

        return results
    def run_all_benchmarks(
        self,
        model_evaluator: Any,
        max_questions_per_benchmark: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Run all benchmarks in the suite.

        Args:
            model_evaluator: Model evaluator instance
            max_questions_per_benchmark: Max questions per benchmark

        Returns:
            List of all results
        """
        print(f"\n{'='*60}")
        print(f"Running Benchmark Suite: {self.name}")
        print(f"Total Benchmarks: {len(self.benchmarks)}")
        print(f"{'='*60}\n")

        all_results = []
        for benchmark in self.benchmarks:
            results = self.run_benchmark(
                benchmark,
                model_evaluator,
                max_questions_per_benchmark
            )
            all_results.append(results)

        # Summary
        self._print_summary(all_results)

        return all_results
    def _calculate_score(self, eval_results: Dict[str, Any]) -> float:
        """
        Calculate benchmark score from evaluation results.

        Args:
            eval_results: Evaluation results

        Returns:
            Score percentage
        """
        metrics = eval_results.get('metrics', {})

        # Use available metrics (prioritize accuracy, then BLEU, then ROUGE)
        if 'accuracy' in metrics:
            return metrics['accuracy']
        elif 'bleu' in metrics:
            return metrics['bleu']
        elif 'rouge_l_f1' in metrics:
            return metrics['rouge_l_f1']
        else:
            # Fallback: simple similarity check
            examples = eval_results.get('examples', [])
            if not examples:
                return 0.0
            matches = 0
            for ex in examples:
                pred = ex.get('prediction', '').lower().strip()
                ref = ex.get('reference', '').lower().strip()
                # Require non-empty strings so an empty prediction is not
                # counted as a trivial substring match.
                if pred and ref and (pred in ref or ref in pred):
                    matches += 1
            return (matches / len(examples)) * 100.0
    def _print_summary(self, results: List[Dict[str, Any]]):
        """Print summary of all benchmark results."""
        print(f"\n{'='*60}")
        print(f"BENCHMARK SUITE SUMMARY: {self.name}")
        print(f"{'='*60}")

        total_benchmarks = len(results)
        passed = sum(1 for r in results if r['passed'])

        print(f"\nOverall: {passed}/{total_benchmarks} benchmarks passed")
        print("\nIndividual Results:")
        for result in results:
            status = '✓ PASS' if result['passed'] else '✗ FAIL'
            print(f"  {status} | {result['benchmark_name']:40s} | Score: {result['score']:6.2f}%")

        avg_score = sum(r['score'] for r in results) / len(results) if results else 0
        print(f"\nAverage Score: {avg_score:.2f}%")
        print(f"{'='*60}\n")
    def save_results(self, filepath: str):
        """
        Save suite results to JSON.

        Args:
            filepath: Output file path
        """
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)

        data = {
            'suite_name': self.name,
            'num_benchmarks': len(self.benchmarks),
            'benchmark_names': [b.name for b in self.benchmarks],
            'results': self.results,
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Suite results saved to: {filepath}")
    def to_dict(self) -> Dict[str, Any]:
        """Convert suite to dictionary."""
        return {
            'name': self.name,
            'num_benchmarks': len(self.benchmarks),
            'benchmarks': [b.to_dict() for b in self.benchmarks],
            'results': self.results
        }
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'BenchmarkSuite':
        """Create suite from dictionary."""
        suite = cls(name=data.get('name', 'Default Suite'))
        for benchmark_data in data.get('benchmarks', []):
            benchmark = Benchmark.from_dict(benchmark_data)
            suite.add_benchmark(benchmark)
        suite.results = data.get('results', [])
        return suite
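

# ---------------------------------------------------------------------------
# Minimal demo sketch. _StubEvaluator is a hypothetical stand-in for a real
# model evaluator; the only assumption this module makes about an evaluator is
# an evaluate_dataset(dataset) method returning a dict with optional 'metrics'
# and 'examples' keys (see BenchmarkSuite._calculate_score).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _StubEvaluator:
        """Echoes each expected answer back so every question scores as a match."""

        def evaluate_dataset(self, dataset):
            examples = [
                {'prediction': item['output'], 'reference': item['output']}
                for item in dataset
            ]
            return {'metrics': {}, 'examples': examples}

    # Build a tiny benchmark, wrap it in a suite, and run it with the stub.
    benchmark = Benchmark(name="smoke_test", description="Tiny sanity check")
    benchmark.add_question("What is 2 + 2?", "4", category="arithmetic", difficulty="beginner")
    benchmark.add_question("Capital of France?", "Paris", category="geography")

    suite = BenchmarkSuite(name="Demo Suite")
    suite.add_benchmark(benchmark)
    suite.run_all_benchmarks(model_evaluator=_StubEvaluator())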