""" Evaluation script for trained model with comprehensive analysis """ import argparse import sys import os import numpy as np import pandas as pd from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer # Add parent directory to path sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from src import ( load_config, compute_metrics_factory, plot_confusion_matrix, print_classification_report ) from src.data_loader import prepare_datasets_for_training def analyze_errors( test_dataset, predictions: np.ndarray, labels: np.ndarray, id2label: dict, tokenizer, top_n: int = 10 ) -> pd.DataFrame: """ Analyze misclassified examples. Args: test_dataset: Test dataset predictions: Predicted labels labels: True labels id2label: Label mapping tokenizer: Tokenizer to decode text top_n: Number of examples to show per error type Returns: DataFrame with error analysis """ errors = [] for i, (pred, true_label) in enumerate(zip(predictions, labels)): if pred != true_label: # Decode the comment (approximate, as original text is removed) # Note: This is a limitation - we'd need to keep original text errors.append({ 'index': i, 'true_label': id2label[true_label], 'predicted_label': id2label[pred], 'error_type': f"{id2label[true_label]} -> {id2label[pred]}" }) error_df = pd.DataFrame(errors) if len(error_df) > 0: print(f"\nError Analysis:") print(f"Total errors: {len(error_df)}") print(f"\nError type distribution:") print(error_df['error_type'].value_counts()) return error_df def evaluate_model( model_path: str, config_path: str = "config.yaml", save_plots: bool = True ): """ Evaluate trained model on test set with comprehensive analysis. Args: model_path: Path to the trained model config_path: Path to configuration file save_plots: Whether to save visualization plots """ print("=" * 60) print("Model Evaluation") print("=" * 60) # Load config config = load_config(config_path) # Create output directory output_dir = config['training'].get('output_dir', './results') os.makedirs(output_dir, exist_ok=True) # Load datasets print("\n[1/5] Loading datasets...") tokenized_datasets, label2id, id2label, _ = prepare_datasets_for_training(config_path) test_dataset = tokenized_datasets['test'] print(f"✓ Test samples: {len(test_dataset)}") # Load model and tokenizer print("\n[2/5] Loading trained model...") tokenizer = AutoTokenizer.from_pretrained(model_path) model = AutoModelForSequenceClassification.from_pretrained(model_path) print(f"✓ Model loaded from {model_path}") # Create trainer for evaluation print("\n[3/5] Running evaluation...") compute_metrics_fn = compute_metrics_factory(id2label) trainer = Trainer( model=model, tokenizer=tokenizer, compute_metrics=compute_metrics_fn ) # Get predictions predictions_output = trainer.predict(test_dataset) predictions = np.argmax(predictions_output.predictions, axis=1) labels = predictions_output.label_ids # Print metrics print("\n[4/5] Computing detailed metrics...") print("\n" + "=" * 60) print("Test Set Results") print("=" * 60) metrics = predictions_output.metrics # Overall metrics print("\nOverall Metrics:") overall_metrics = ['accuracy', 'f1_weighted', 'f1_macro', 'precision_weighted', 'recall_weighted'] for metric in overall_metrics: key = f'test_{metric}' if key in metrics: print(f" {metric.replace('_', ' ').title()}: {metrics[key]:.4f}") # Per-class metrics print("\nPer-Class Metrics:") label_names = [id2label[i] for i in range(len(id2label))] for label_name in label_names: precision_key = 
f'test_precision_{label_name}' recall_key = f'test_recall_{label_name}' f1_key = f'test_f1_{label_name}' if precision_key in metrics: print(f"\n {label_name.upper()}:") print(f" Precision: {metrics[precision_key]:.4f}") print(f" Recall: {metrics[recall_key]:.4f}") print(f" F1-Score: {metrics[f1_key]:.4f}") print(f" Support: {metrics.get(f'test_support_{label_name}', 'N/A')}") # Detailed classification report print("\n" + "=" * 60) print_classification_report(labels, predictions, label_names) # Plot confusion matrix print("\n[5/5] Generating visualizations...") if save_plots: plot_confusion_matrix( labels, predictions, label_names, save_path=os.path.join(output_dir, "confusion_matrix.png"), normalize=False ) # Also save normalized version plot_confusion_matrix( labels, predictions, label_names, save_path=os.path.join(output_dir, "confusion_matrix_normalized.png"), normalize=True ) # Error analysis error_df = analyze_errors(test_dataset, predictions, labels, id2label, tokenizer) if len(error_df) > 0 and save_plots: error_path = os.path.join(output_dir, "error_analysis.csv") error_df.to_csv(error_path, index=False) print(f"✓ Error analysis saved to {error_path}") print("\n" + "=" * 60) print("Evaluation Complete! 🎉") print("=" * 60) print(f"\nResults saved to: {output_dir}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Evaluate trained model") parser.add_argument( "--model-path", type=str, default="./results/final_model", help="Path to the trained model" ) parser.add_argument( "--config", type=str, default="config.yaml", help="Path to configuration file" ) parser.add_argument( "--no-plots", action="store_true", help="Skip generating visualization plots" ) args = parser.parse_args() evaluate_model(args.model_path, args.config, save_plots=not args.no_plots)
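# Example invocations (a sketch only; the script's filename and location in the
# repository are assumptions here, and the defaults come from the argparse setup above):
#
#   python evaluate.py
#   python evaluate.py --model-path ./results/final_model --config config.yaml
#   python evaluate.py --no-plots    # skip confusion-matrix PNGs, still writes error_analysis.csv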