""" Metrics Module Provides various evaluation metrics for model performance assessment. """ import numpy as np from typing import List, Dict, Optional, Union import math class Metrics: """ Comprehensive metrics calculator for model evaluation. Supports: - BLEU score - ROUGE-L - Perplexity - Custom financial domain metrics """ def __init__(self): """Initialize metrics calculator.""" self.results = {} def calculate_bleu( self, references: List[str], hypotheses: List[str], max_n: int = 4 ) -> float: """ Calculate BLEU score. Args: references: Reference texts hypotheses: Generated texts max_n: Maximum n-gram size Returns: BLEU score (0-100) """ try: from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction # Tokenize ref_tokens = [[ref.split()] for ref in references] hyp_tokens = [hyp.split() for hyp in hypotheses] # Calculate with smoothing smoothing = SmoothingFunction() score = corpus_bleu( ref_tokens, hyp_tokens, smoothing_function=smoothing.method1 ) return score * 100.0 except ImportError: # Fallback: simple word overlap return self._simple_bleu(references, hypotheses) def _simple_bleu(self, references: List[str], hypotheses: List[str]) -> float: """Simple BLEU approximation without NLTK.""" total_overlap = 0 total_length = 0 for ref, hyp in zip(references, hypotheses): ref_words = set(ref.lower().split()) hyp_words = set(hyp.lower().split()) overlap = len(ref_words & hyp_words) total_overlap += overlap total_length += max(len(ref_words), len(hyp_words)) if total_length == 0: return 0.0 return (total_overlap / total_length) * 100.0 def calculate_rouge_l( self, references: List[str], hypotheses: List[str] ) -> Dict[str, float]: """ Calculate ROUGE-L score. Args: references: Reference texts hypotheses: Generated texts Returns: Dict with precision, recall, f1 """ total_precision = 0 total_recall = 0 total_f1 = 0 for ref, hyp in zip(references, hypotheses): ref_words = ref.split() hyp_words = hyp.split() # Find longest common subsequence lcs_length = self._lcs_length(ref_words, hyp_words) # Calculate metrics precision = lcs_length / len(hyp_words) if len(hyp_words) > 0 else 0 recall = lcs_length / len(ref_words) if len(ref_words) > 0 else 0 f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0 total_precision += precision total_recall += recall total_f1 += f1 n = len(references) return { 'precision': (total_precision / n) * 100.0 if n > 0 else 0.0, 'recall': (total_recall / n) * 100.0 if n > 0 else 0.0, 'f1': (total_f1 / n) * 100.0 if n > 0 else 0.0 } def _lcs_length(self, seq1: List[str], seq2: List[str]) -> int: """Calculate longest common subsequence length.""" m, n = len(seq1), len(seq2) dp = [[0] * (n + 1) for _ in range(m + 1)] for i in range(1, m + 1): for j in range(1, n + 1): if seq1[i-1] == seq2[j-1]: dp[i][j] = dp[i-1][j-1] + 1 else: dp[i][j] = max(dp[i-1][j], dp[i][j-1]) return dp[m][n] def calculate_perplexity( self, log_probs: List[float] ) -> float: """ Calculate perplexity from log probabilities. Args: log_probs: List of log probabilities Returns: Perplexity score """ if not log_probs: return float('inf') avg_log_prob = sum(log_probs) / len(log_probs) perplexity = math.exp(-avg_log_prob) return perplexity def calculate_accuracy( self, predictions: List[str], references: List[str] ) -> float: """ Calculate exact match accuracy. 
Args: predictions: Predicted answers references: Reference answers Returns: Accuracy percentage """ if not predictions or not references: return 0.0 matches = sum( pred.strip().lower() == ref.strip().lower() for pred, ref in zip(predictions, references) ) return (matches / len(predictions)) * 100.0 def calculate_all_metrics( self, predictions: List[str], references: List[str], log_probs: Optional[List[float]] = None ) -> Dict[str, float]: """ Calculate all available metrics. Args: predictions: Model predictions references: Reference answers log_probs: Optional log probabilities for perplexity Returns: Dict of all metrics """ metrics = {} # BLEU try: metrics['bleu'] = self.calculate_bleu(references, predictions) except Exception as e: print(f"BLEU calculation error: {e}") metrics['bleu'] = 0.0 # ROUGE-L try: rouge = self.calculate_rouge_l(references, predictions) metrics['rouge_l_precision'] = rouge['precision'] metrics['rouge_l_recall'] = rouge['recall'] metrics['rouge_l_f1'] = rouge['f1'] except Exception as e: print(f"ROUGE calculation error: {e}") metrics['rouge_l_f1'] = 0.0 # Accuracy try: metrics['accuracy'] = self.calculate_accuracy(predictions, references) except Exception as e: print(f"Accuracy calculation error: {e}") metrics['accuracy'] = 0.0 # Perplexity if log_probs: try: metrics['perplexity'] = self.calculate_perplexity(log_probs) except Exception as e: print(f"Perplexity calculation error: {e}") metrics['perplexity'] = float('inf') # Average response length metrics['avg_response_length'] = sum(len(p.split()) for p in predictions) / len(predictions) return metrics def calculate_perplexity(log_probs: List[float]) -> float: """ Standalone function to calculate perplexity. Args: log_probs: List of log probabilities Returns: Perplexity score """ metrics = Metrics() return metrics.calculate_perplexity(log_probs) def calculate_bleu(references: List[str], hypotheses: List[str]) -> float: """ Standalone function to calculate BLEU score. Args: references: Reference texts hypotheses: Generated texts Returns: BLEU score (0-100) """ metrics = Metrics() return metrics.calculate_bleu(references, hypotheses)
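# Illustrative usage sketch: shows how the Metrics class and the standalone helpers
# above might be exercised together. The reference/prediction strings and the
# per-token log probabilities below are placeholder data for demonstration only,
# not part of the module's API or any real evaluation output.
if __name__ == "__main__":
    references = [
        "the company reported strong quarterly earnings",
        "revenue increased by ten percent year over year",
    ]
    predictions = [
        "the company reported strong earnings this quarter",
        "revenue grew ten percent year over year",
    ]
    # Hypothetical per-token log probabilities from a language model
    log_probs = [-1.2, -0.8, -2.1, -0.5]

    metrics = Metrics()
    results = metrics.calculate_all_metrics(predictions, references, log_probs)
    for name, value in sorted(results.items()):
        print(f"{name}: {value:.2f}")

    # The module-level helpers wrap the same class methods
    print(f"standalone BLEU: {calculate_bleu(references, predictions):.2f}")
    print(f"standalone perplexity: {calculate_perplexity(log_probs):.2f}")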