"""
Model Evaluator Module
Provides model evaluation and inference capabilities.
"""
from dataclasses import dataclass
from typing import List, Dict, Optional, Any
import json
import time
from pathlib import Path
import torch
@dataclass
class EvaluationConfig:
"""
Configuration for model evaluation.
"""
model_name: str = "gpt2"
model_path: Optional[str] = None
max_length: int = 512
temperature: float = 0.7
top_p: float = 0.9
top_k: int = 50
num_beams: int = 1
do_sample: bool = True
batch_size: int = 1
device: str = "auto"
max_samples: Optional[int] = None
save_results: bool = True
output_dir: str = "evaluation_results"
include_metrics: bool = True
include_timings: bool = True
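# Illustrative configuration override (hypothetical path and values, shown for
# reference only; nothing in this module depends on them):
#
#   config = EvaluationConfig(
#       model_path="./outputs/my-finetune",  # hypothetical local checkpoint
#       do_sample=False,                     # greedy/beam decoding instead of sampling
#       num_beams=4,
#       max_samples=50,
#   )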
class ModelEvaluator:
"""
Model evaluator for testing and benchmarking LLMs.
Features:
- Inference on test datasets
- Batch processing
- Performance metrics
- Result saving and analysis
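    Usage note: the model is loaded lazily; generate_response() and
    evaluate_dataset() call load_model() automatically on first use.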
"""
def __init__(
self,
config: Optional[EvaluationConfig] = None,
model: Optional[Any] = None,
tokenizer: Optional[Any] = None
):
"""
Initialize evaluator.
Args:
config: Evaluation configuration
model: Pre-loaded model (optional)
tokenizer: Pre-loaded tokenizer (optional)
"""
self.config = config or EvaluationConfig()
self.model = model
self.tokenizer = tokenizer
self.results = []
self.metrics = {}
def load_model(self):
"""Load model and tokenizer."""
if self.model is not None and self.tokenizer is not None:
print("Using pre-loaded model and tokenizer")
return
try:
from transformers import AutoModelForCausalLM, AutoTokenizer
print(f"Loading model: {self.config.model_name}")
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
self.config.model_path or self.config.model_name,
trust_remote_code=True
)
# Ensure pad token exists
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
# Load model
device_map = "auto" if self.config.device == "auto" else None
self.model = AutoModelForCausalLM.from_pretrained(
self.config.model_path or self.config.model_name,
device_map=device_map,
trust_remote_code=True,
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
self.model.eval()
print("Model loaded successfully")
except Exception as e:
print(f"Error loading model: {e}")
raise
def generate_response(
self,
prompt: str,
max_length: Optional[int] = None,
temperature: Optional[float] = None
) -> str:
"""
Generate response for a single prompt.
Args:
prompt: Input prompt
max_length: Max generation length
temperature: Sampling temperature
Returns:
Generated text
"""
if self.model is None or self.tokenizer is None:
self.load_model()
        # Fall back to the configured defaults only when no override is given
        if max_length is None:
            max_length = self.config.max_length
        if temperature is None:
            temperature = self.config.temperature
try:
# Tokenize
inputs = self.tokenizer(
prompt,
return_tensors="pt",
truncation=True,
max_length=2048
)
# Move to device
if hasattr(self.model, 'device'):
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            # Generate (max_new_tokens counts only generated tokens, so long
            # prompts are not cut off by the overall length budget)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    temperature=temperature,
                    top_p=self.config.top_p,
                    top_k=self.config.top_k,
                    num_beams=self.config.num_beams,
                    do_sample=self.config.do_sample,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )
            # Decode only the newly generated tokens, which is more robust than
            # string-matching the prompt against the decoded output
            prompt_length = inputs["input_ids"].shape[1]
            generated_text = self.tokenizer.decode(
                outputs[0][prompt_length:],
                skip_special_tokens=True
            ).strip()
return generated_text
except Exception as e:
print(f"Generation error: {e}")
return f"[Error: {str(e)}]"
def evaluate_dataset(
self,
dataset: List[Dict[str, str]],
max_samples: Optional[int] = None
) -> Dict[str, Any]:
"""
Evaluate model on a dataset.
Args:
dataset: List of examples with 'instruction', 'input', 'output'
max_samples: Maximum samples to evaluate
Returns:
Evaluation results
"""
if self.model is None:
self.load_model()
        max_samples = max_samples or self.config.max_samples or len(dataset)
        dataset = dataset[:max_samples]
        if not dataset:
            raise ValueError("evaluate_dataset received an empty dataset")
        print(f"\nEvaluating on {len(dataset)} examples...")
results = []
predictions = []
references = []
start_time = time.time()
for i, example in enumerate(dataset):
# Build prompt
instruction = example.get('instruction', '')
input_text = example.get('input', '')
reference = example.get('output', '')
if input_text:
prompt = f"{instruction}\n\nInput: {input_text}\n\nResponse:"
else:
prompt = f"{instruction}\n\nResponse:"
# Generate
example_start = time.time()
prediction = self.generate_response(prompt)
example_time = time.time() - example_start
# Store results
result = {
'index': i,
'instruction': instruction,
'input': input_text,
'reference': reference,
'prediction': prediction,
'generation_time': example_time
}
results.append(result)
predictions.append(prediction)
references.append(reference)
if (i + 1) % 10 == 0:
print(f" Processed {i + 1}/{len(dataset)} examples...")
total_time = time.time() - start_time
# Calculate metrics
metrics = {}
if self.config.include_metrics:
try:
from .metrics import Metrics
metrics_calc = Metrics()
metrics = metrics_calc.calculate_all_metrics(predictions, references)
except Exception as e:
print(f"Metrics calculation error: {e}")
metrics = {'error': str(e)}
# Compile results
evaluation_results = {
'config': {
'model_name': self.config.model_name,
'model_path': self.config.model_path,
'max_length': self.config.max_length,
'temperature': self.config.temperature,
'num_samples': len(dataset)
},
'metrics': metrics,
'timing': {
'total_time': total_time,
'avg_time_per_example': total_time / len(dataset),
'throughput': len(dataset) / total_time
},
            'examples': results
        }
        # Keep the latest run on the instance so results/metrics can be inspected later
        self.results = results
        self.metrics = metrics
# Save results
if self.config.save_results:
self.save_results(evaluation_results)
print(f"\n✅ Evaluation complete!")
print(f"Total time: {total_time:.2f}s")
print(f"Avg time per example: {total_time/len(dataset):.2f}s")
if metrics:
print(f"\nMetrics:")
for key, value in metrics.items():
if isinstance(value, (int, float)):
print(f" {key}: {value:.2f}")
return evaluation_results
def save_results(self, results: Dict[str, Any], filename: Optional[str] = None):
"""
Save evaluation results to JSON.
Args:
results: Evaluation results
filename: Output filename
"""
output_dir = Path(self.config.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
if filename is None:
timestamp = time.strftime('%Y%m%d_%H%M%S')
filename = f"evaluation_{timestamp}.json"
filepath = output_dir / filename
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Results saved to: {filepath}")
def load_results(self, filepath: str) -> Dict[str, Any]:
"""
Load evaluation results from JSON.
Args:
filepath: Path to results file
Returns:
Loaded results
"""
with open(filepath, 'r', encoding='utf-8') as f:
results = json.load(f)
return results
def compare_results(self, results_list: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Compare multiple evaluation results.
Args:
results_list: List of evaluation results
Returns:
Comparison summary
"""
comparison = {
'num_evaluations': len(results_list),
'models': [r['config']['model_name'] for r in results_list],
'metrics_comparison': {}
}
        # Extract metrics keyed by model name
        all_metrics = {}
        for results in results_list:
            model_name = results['config']['model_name']
            all_metrics[model_name] = results.get('metrics', {})
# Compare each metric
metric_names = set()
for metrics in all_metrics.values():
metric_names.update(metrics.keys())
for metric in metric_names:
values = {}
for model, metrics in all_metrics.items():
if metric in metrics:
values[model] = metrics[metric]
if values:
comparison['metrics_comparison'][metric] = values
return comparison
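
if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): evaluates the default gpt2 model on a
    # tiny in-memory dataset. The examples below are made up; metric calculation and
    # result saving are disabled so no optional modules or output files are needed.
    demo_config = EvaluationConfig(
        model_name="gpt2",
        max_length=64,
        include_metrics=False,
        save_results=False,
    )
    evaluator = ModelEvaluator(config=demo_config)
    demo_dataset = [
        {
            "instruction": "Translate to French.",
            "input": "Hello, world!",
            "output": "Bonjour, le monde !",
        },
        {
            "instruction": "Name a primary color.",
            "input": "",
            "output": "Red",
        },
    ]
    summary = evaluator.evaluate_dataset(demo_dataset)
    print("Timing summary:", summary["timing"])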