| """ | |
| Benchmark Builder Module | |
| Provides interactive tools for creating custom benchmarks. | |
| """ | |
| from dataclasses import dataclass, field | |
| from typing import List, Dict, Optional, Any, Callable | |
| import json | |
| import time | |
| from pathlib import Path | |


@dataclass
class Benchmark:
    """
    A single benchmark test (builder variant).

    This is a builder-specific implementation with enhanced
    interactive creation features.
    """
    name: str
    description: str = ""
    questions: List[Dict[str, Any]] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)
    created_at: Optional[str] = None
    domain: str = "general"
    difficulty: str = "mixed"
    passing_score: float = 70.0
    categories: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Initialize timestamp if not provided."""
        if self.created_at is None:
            self.created_at = time.strftime('%Y-%m-%d %H:%M:%S')

    def add_question(
        self,
        question: str,
        answer: str,
        category: Optional[str] = None,
        difficulty: Optional[str] = None,
        metadata: Optional[Dict] = None,
        explanation: Optional[str] = None,
        points: float = 1.0
    ):
        """
        Add a question to the benchmark.

        Args:
            question: Question text
            answer: Expected answer
            category: Question category/topic
            difficulty: Difficulty level
            metadata: Additional metadata
            explanation: Answer explanation
            points: Points for this question
        """
        question_dict = {
            'id': len(self.questions),
            'question': question,
            'answer': answer,
            'category': category or 'general',
            'difficulty': difficulty or 'intermediate',
            'explanation': explanation or '',
            'points': points,
            'metadata': metadata or {}
        }
        self.questions.append(question_dict)

        # Update categories list
        if category and category not in self.categories:
            self.categories.append(category)

    def add_multiple_choice_question(
        self,
        question: str,
        choices: List[str],
        correct_answer: str,
        category: Optional[str] = None,
        difficulty: Optional[str] = None,
        explanation: Optional[str] = None
    ):
        """
        Add a multiple choice question.

        Args:
            question: Question text
            choices: List of answer choices
            correct_answer: The correct answer
            category: Question category
            difficulty: Difficulty level
            explanation: Answer explanation
        """
        self.add_question(
            question=question,
            answer=correct_answer,
            category=category,
            difficulty=difficulty,
            explanation=explanation,
            metadata={
                'type': 'multiple_choice',
                'choices': choices
            }
        )

    def import_from_json(self, filepath: str):
        """
        Import questions from JSON file.

        Args:
            filepath: Path to JSON file
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handle different JSON formats
        if isinstance(data, list):
            # List of questions
            for item in data:
                self.add_question(
                    question=item.get('question', ''),
                    answer=item.get('answer', ''),
                    category=item.get('category'),
                    difficulty=item.get('difficulty'),
                    metadata=item.get('metadata', {})
                )
        elif isinstance(data, dict):
            # Benchmark format
            if 'questions' in data:
                for item in data['questions']:
                    self.add_question(
                        question=item.get('question', ''),
                        answer=item.get('answer', ''),
                        category=item.get('category'),
                        difficulty=item.get('difficulty'),
                        metadata=item.get('metadata', {})
                    )

        print(f"Imported {len(self.questions)} questions from {filepath}")

    def import_from_csv(self, filepath: str, delimiter: str = ','):
        """
        Import questions from CSV file.

        Expected columns: question, answer, category, difficulty

        Args:
            filepath: Path to CSV file
            delimiter: CSV delimiter
        """
        import csv

        with open(filepath, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter=delimiter)
            for row in reader:
                self.add_question(
                    question=row.get('question', ''),
                    answer=row.get('answer', ''),
                    category=row.get('category'),
                    difficulty=row.get('difficulty')
                )

        print(f"Imported {len(self.questions)} questions from CSV")

    def get_statistics(self) -> Dict[str, Any]:
        """Get benchmark statistics."""
        stats = {
            'total_questions': len(self.questions),
            'categories': {},
            'difficulties': {},
            'avg_question_length': 0,
            'avg_answer_length': 0
        }

        # Count by category and difficulty
        for q in self.questions:
            cat = q.get('category', 'uncategorized')
            stats['categories'][cat] = stats['categories'].get(cat, 0) + 1
            diff = q.get('difficulty', 'unknown')
            stats['difficulties'][diff] = stats['difficulties'].get(diff, 0) + 1

        # Calculate averages
        if self.questions:
            total_q_len = sum(len(q['question']) for q in self.questions)
            total_a_len = sum(len(q['answer']) for q in self.questions)
            stats['avg_question_length'] = total_q_len / len(self.questions)
            stats['avg_answer_length'] = total_a_len / len(self.questions)

        return stats

    def validate(self) -> List[str]:
        """
        Validate benchmark and return list of issues.

        Returns:
            List of validation issues (empty if valid)
        """
        issues = []

        if not self.name:
            issues.append("Benchmark name is required")
        if not self.questions:
            issues.append("Benchmark has no questions")

        for i, q in enumerate(self.questions):
            if not q.get('question'):
                issues.append(f"Question {i} is missing question text")
            if not q.get('answer'):
                issues.append(f"Question {i} is missing answer")

        return issues

    def to_dict(self) -> Dict[str, Any]:
        """Convert benchmark to dictionary."""
        return {
            'name': self.name,
            'description': self.description,
            'domain': self.domain,
            'difficulty': self.difficulty,
            'passing_score': self.passing_score,
            'created_at': self.created_at,
            'categories': self.categories,
            'tags': self.tags,
            'num_questions': len(self.questions),
            'questions': self.questions,
            'metadata': self.metadata,
            'statistics': self.get_statistics()
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Benchmark':
        """Create benchmark from dictionary."""
        return cls(
            name=data.get('name', 'Untitled'),
            description=data.get('description', ''),
            questions=data.get('questions', []),
            metadata=data.get('metadata', {}),
            created_at=data.get('created_at'),
            domain=data.get('domain', 'general'),
            difficulty=data.get('difficulty', 'mixed'),
            passing_score=data.get('passing_score', 70.0),
            categories=data.get('categories', []),
            tags=data.get('tags', [])
        )

    def save(self, filepath: str):
        """Save benchmark to JSON file."""
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
        print(f"Benchmark saved to: {filepath}")

    @classmethod
    def load(cls, filepath: str) -> 'Benchmark':
        """Load benchmark from JSON file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.from_dict(data)
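

# Usage sketch (illustrative only, not part of the module's API): building a
# small Benchmark by hand. The question text, answers, and output path below
# are made-up examples.
def _example_build_benchmark() -> Benchmark:
    """Assemble a tiny two-question benchmark and save it to disk."""
    bench = Benchmark(
        name="Geography Basics",
        description="Capital-city questions",
        domain="geography"
    )
    bench.add_question(
        question="What is the capital of France?",
        answer="Paris",
        category="capitals",
        difficulty="easy"
    )
    bench.add_multiple_choice_question(
        question="Which of these is the capital of Japan?",
        choices=["Osaka", "Kyoto", "Tokyo", "Nagoya"],
        correct_answer="Tokyo",
        category="capitals"
    )
    # get_statistics() reports per-category and per-difficulty counts
    # alongside average question/answer lengths.
    print(bench.get_statistics())
    bench.save("benchmarks/geography_basics.json")
    return bench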


class BenchmarkSuite:
    """
    Collection of benchmarks with enhanced building features.
    """

    def __init__(self, name: str = "Default Suite"):
        """
        Initialize benchmark suite.

        Args:
            name: Suite name
        """
        self.name = name
        self.description = ""
        self.benchmarks: List[Benchmark] = []
        self.metadata: Dict[str, Any] = {}

    def add_benchmark(self, benchmark: Benchmark):
        """Add a benchmark to the suite."""
        self.benchmarks.append(benchmark)
        print(f"Added benchmark: {benchmark.name}")

    def create_benchmark(
        self,
        name: str,
        description: str = "",
        domain: str = "general",
        passing_score: float = 70.0
    ) -> Benchmark:
        """
        Create a new benchmark and add it to the suite.

        Args:
            name: Benchmark name
            description: Benchmark description
            domain: Domain/topic
            passing_score: Passing score percentage

        Returns:
            Created benchmark
        """
        benchmark = Benchmark(
            name=name,
            description=description,
            domain=domain,
            passing_score=passing_score
        )
        self.add_benchmark(benchmark)
        return benchmark

    def get_benchmark(self, name: str) -> Optional[Benchmark]:
        """Get a benchmark by name."""
        for benchmark in self.benchmarks:
            if benchmark.name == name:
                return benchmark
        return None

    def remove_benchmark(self, benchmark_name: str):
        """Remove a benchmark by name."""
        self.benchmarks = [b for b in self.benchmarks if b.name != benchmark_name]

    def list_benchmarks(self) -> List[str]:
        """Get list of benchmark names."""
        return [b.name for b in self.benchmarks]

    def get_statistics(self) -> Dict[str, Any]:
        """Get suite-wide statistics."""
        total_questions = sum(len(b.questions) for b in self.benchmarks)
        stats = {
            'num_benchmarks': len(self.benchmarks),
            'total_questions': total_questions,
            'benchmarks': []
        }
        for benchmark in self.benchmarks:
            stats['benchmarks'].append({
                'name': benchmark.name,
                'num_questions': len(benchmark.questions),
                'domain': benchmark.domain,
                'passing_score': benchmark.passing_score
            })
        return stats

    def to_dict(self) -> Dict[str, Any]:
        """Convert suite to dictionary."""
        return {
            'name': self.name,
            'description': self.description,
            'num_benchmarks': len(self.benchmarks),
            'benchmarks': [b.to_dict() for b in self.benchmarks],
            'metadata': self.metadata,
            'statistics': self.get_statistics()
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'BenchmarkSuite':
        """Create suite from dictionary."""
        suite = cls(name=data.get('name', 'Default Suite'))
        suite.description = data.get('description', '')
        suite.metadata = data.get('metadata', {})
        for benchmark_data in data.get('benchmarks', []):
            benchmark = Benchmark.from_dict(benchmark_data)
            suite.add_benchmark(benchmark)
        return suite

    def save(self, filepath: str):
        """Save suite to JSON file."""
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
        print(f"Benchmark suite saved to: {filepath}")

    @classmethod
    def load(cls, filepath: str) -> 'BenchmarkSuite':
        """Load suite from JSON file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.from_dict(data)
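

# Usage sketch (illustrative only): grouping benchmarks into a BenchmarkSuite.
# The suite name, benchmark names, and output path are hypothetical examples.
def _example_build_suite() -> BenchmarkSuite:
    """Create a suite with two benchmarks and report suite-level statistics."""
    suite = BenchmarkSuite(name="LLM Evaluation Suite")
    math_bench = suite.create_benchmark(
        name="Arithmetic",
        domain="math",
        passing_score=80.0
    )
    math_bench.add_question(question="What is 2 + 2?", answer="4", difficulty="easy")
    suite.create_benchmark(name="History", domain="humanities")
    print(suite.list_benchmarks())                    # ['Arithmetic', 'History']
    print(suite.get_statistics()['total_questions'])  # 1
    suite.save("suites/llm_eval_suite.json")
    return suite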


class InteractiveBenchmarkBuilder:
    """
    Interactive builder for creating benchmarks through UI/CLI.
    """

    def __init__(self):
        """Initialize builder."""
        self.current_benchmark: Optional[Benchmark] = None
        self.current_suite: Optional[BenchmarkSuite] = None

    def create_benchmark(
        self,
        name: str,
        description: str = "",
        domain: str = "general"
    ) -> Benchmark:
        """
        Create a new benchmark.

        Args:
            name: Benchmark name
            description: Description
            domain: Domain/topic

        Returns:
            Created benchmark
        """
        self.current_benchmark = Benchmark(
            name=name,
            description=description,
            domain=domain
        )
        return self.current_benchmark

    def add_question_interactive(
        self,
        question: str,
        answer: str,
        category: Optional[str] = None,
        difficulty: Optional[str] = None
    ) -> bool:
        """
        Add question to current benchmark.

        Args:
            question: Question text
            answer: Answer text
            category: Category
            difficulty: Difficulty level

        Returns:
            Success status
        """
        if not self.current_benchmark:
            print("No active benchmark. Create one first.")
            return False

        self.current_benchmark.add_question(
            question=question,
            answer=answer,
            category=category,
            difficulty=difficulty
        )
        return True

    def preview_benchmark(self) -> str:
        """Preview current benchmark."""
        if not self.current_benchmark:
            return "No active benchmark"

        stats = self.current_benchmark.get_statistics()
        preview = f"""
Benchmark: {self.current_benchmark.name}
Description: {self.current_benchmark.description}
Domain: {self.current_benchmark.domain}
Total Questions: {stats['total_questions']}
Categories:
"""
        for cat, count in stats['categories'].items():
            preview += f"  - {cat}: {count} questions\n"
        return preview

    def finalize_benchmark(self, filepath: Optional[str] = None) -> Benchmark:
        """
        Finalize and optionally save benchmark.

        Args:
            filepath: Optional save path

        Returns:
            Finalized benchmark
        """
        if not self.current_benchmark:
            raise ValueError("No active benchmark to finalize")

        issues = self.current_benchmark.validate()
        if issues:
            print("Validation warnings:")
            for issue in issues:
                print(f"  - {issue}")

        if filepath:
            self.current_benchmark.save(filepath)

        benchmark = self.current_benchmark
        self.current_benchmark = None
        return benchmark
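

# Usage sketch (illustrative only): driving the InteractiveBenchmarkBuilder end
# to end. The question, answer, and save path are hypothetical; in a real CLI
# or UI they would come from user input.
if __name__ == "__main__":
    builder = InteractiveBenchmarkBuilder()
    builder.create_benchmark(
        name="Python Basics",
        description="Intro-level Python questions",
        domain="programming"
    )
    builder.add_question_interactive(
        question="Which keyword defines a function in Python?",
        answer="def",
        category="syntax",
        difficulty="easy"
    )
    print(builder.preview_benchmark())
    # finalize_benchmark() validates, optionally saves, and clears the active benchmark.
    finished = builder.finalize_benchmark("benchmarks/python_basics.json")
    print(f"Finalized '{finished.name}' with {len(finished.questions)} question(s)")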