| """ | |
| Metrics Module | |
| Provides various evaluation metrics for model performance assessment. | |
| """ | |
| import numpy as np | |
| from typing import List, Dict, Optional, Union | |
| import math | |


class Metrics:
    """
    Comprehensive metrics calculator for model evaluation.

    Supports:
    - BLEU score
    - ROUGE-L
    - Perplexity
    - Custom financial domain metrics
    """

    def __init__(self):
        """Initialize metrics calculator."""
        self.results = {}
    def calculate_bleu(
        self,
        references: List[str],
        hypotheses: List[str],
        max_n: int = 4
    ) -> float:
        """
        Calculate BLEU score.

        Args:
            references: Reference texts
            hypotheses: Generated texts
            max_n: Maximum n-gram size

        Returns:
            BLEU score (0-100)
        """
        try:
            from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

            # Tokenize; corpus_bleu expects a list of reference lists per hypothesis
            ref_tokens = [[ref.split()] for ref in references]
            hyp_tokens = [hyp.split() for hyp in hypotheses]

            # Calculate with smoothing, weighting n-grams up to max_n uniformly
            smoothing = SmoothingFunction()
            weights = tuple(1.0 / max_n for _ in range(max_n))
            score = corpus_bleu(
                ref_tokens,
                hyp_tokens,
                weights=weights,
                smoothing_function=smoothing.method1
            )
            return score * 100.0
        except ImportError:
            # Fallback: simple word overlap when NLTK is not installed
            return self._simple_bleu(references, hypotheses)
    def _simple_bleu(self, references: List[str], hypotheses: List[str]) -> float:
        """Simple BLEU approximation without NLTK."""
        total_overlap = 0
        total_length = 0

        for ref, hyp in zip(references, hypotheses):
            ref_words = set(ref.lower().split())
            hyp_words = set(hyp.lower().split())
            overlap = len(ref_words & hyp_words)
            total_overlap += overlap
            total_length += max(len(ref_words), len(hyp_words))

        if total_length == 0:
            return 0.0
        return (total_overlap / total_length) * 100.0
    def calculate_rouge_l(
        self,
        references: List[str],
        hypotheses: List[str]
    ) -> Dict[str, float]:
        """
        Calculate ROUGE-L score.

        Args:
            references: Reference texts
            hypotheses: Generated texts

        Returns:
            Dict with precision, recall, f1
        """
        total_precision = 0.0
        total_recall = 0.0
        total_f1 = 0.0

        for ref, hyp in zip(references, hypotheses):
            ref_words = ref.split()
            hyp_words = hyp.split()

            # Find longest common subsequence
            lcs_length = self._lcs_length(ref_words, hyp_words)

            # Calculate metrics
            precision = lcs_length / len(hyp_words) if hyp_words else 0.0
            recall = lcs_length / len(ref_words) if ref_words else 0.0
            f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

            total_precision += precision
            total_recall += recall
            total_f1 += f1

        n = len(references)
        return {
            'precision': (total_precision / n) * 100.0 if n > 0 else 0.0,
            'recall': (total_recall / n) * 100.0 if n > 0 else 0.0,
            'f1': (total_f1 / n) * 100.0 if n > 0 else 0.0
        }
    def _lcs_length(self, seq1: List[str], seq2: List[str]) -> int:
        """Calculate longest common subsequence length."""
        m, n = len(seq1), len(seq2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if seq1[i - 1] == seq2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[m][n]
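
    # Worked example (illustrative, not from the original module): for
    # seq1 = ["the", "cat", "sat"] and seq2 = ["the", "dog", "sat"], the longest
    # common subsequence is ["the", "sat"], so _lcs_length returns 2; ROUGE-L
    # precision and recall for that pair are both 2/3.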
    def calculate_perplexity(
        self,
        log_probs: List[float]
    ) -> float:
        """
        Calculate perplexity from log probabilities.

        Args:
            log_probs: List of per-token natural log probabilities

        Returns:
            Perplexity score (lower is better)
        """
        if not log_probs:
            return float('inf')

        avg_log_prob = sum(log_probs) / len(log_probs)
        try:
            return math.exp(-avg_log_prob)
        except OverflowError:
            # Extremely low average log probability; report infinite perplexity
            return float('inf')
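
    # Worked example (hypothetical numbers, not from the original module):
    # log_probs = [-1.0, -2.0, -3.0] averages to -2.0, so calculate_perplexity
    # returns exp(2.0) ≈ 7.39.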
    def calculate_accuracy(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """
        Calculate exact match accuracy.

        Args:
            predictions: Predicted answers
            references: Reference answers

        Returns:
            Accuracy percentage
        """
        if not predictions or not references:
            return 0.0

        matches = sum(
            pred.strip().lower() == ref.strip().lower()
            for pred, ref in zip(predictions, references)
        )
        return (matches / len(predictions)) * 100.0
    def calculate_all_metrics(
        self,
        predictions: List[str],
        references: List[str],
        log_probs: Optional[List[float]] = None
    ) -> Dict[str, float]:
        """
        Calculate all available metrics.

        Args:
            predictions: Model predictions
            references: Reference answers
            log_probs: Optional log probabilities for perplexity

        Returns:
            Dict of all metrics
        """
        metrics = {}

        # BLEU
        try:
            metrics['bleu'] = self.calculate_bleu(references, predictions)
        except Exception as e:
            print(f"BLEU calculation error: {e}")
            metrics['bleu'] = 0.0

        # ROUGE-L
        try:
            rouge = self.calculate_rouge_l(references, predictions)
            metrics['rouge_l_precision'] = rouge['precision']
            metrics['rouge_l_recall'] = rouge['recall']
            metrics['rouge_l_f1'] = rouge['f1']
        except Exception as e:
            print(f"ROUGE calculation error: {e}")
            metrics['rouge_l_f1'] = 0.0

        # Accuracy
        try:
            metrics['accuracy'] = self.calculate_accuracy(predictions, references)
        except Exception as e:
            print(f"Accuracy calculation error: {e}")
            metrics['accuracy'] = 0.0

        # Perplexity
        if log_probs:
            try:
                metrics['perplexity'] = self.calculate_perplexity(log_probs)
            except Exception as e:
                print(f"Perplexity calculation error: {e}")
                metrics['perplexity'] = float('inf')

        # Average response length in words (guard against an empty prediction list)
        if predictions:
            metrics['avg_response_length'] = sum(len(p.split()) for p in predictions) / len(predictions)
        else:
            metrics['avg_response_length'] = 0.0

        return metrics


def calculate_perplexity(log_probs: List[float]) -> float:
    """
    Standalone function to calculate perplexity.

    Args:
        log_probs: List of log probabilities

    Returns:
        Perplexity score
    """
    metrics = Metrics()
    return metrics.calculate_perplexity(log_probs)


def calculate_bleu(references: List[str], hypotheses: List[str]) -> float:
    """
    Standalone function to calculate BLEU score.

    Args:
        references: Reference texts
        hypotheses: Generated texts

    Returns:
        BLEU score (0-100)
    """
    metrics = Metrics()
    return metrics.calculate_bleu(references, hypotheses)
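

# --- Illustrative usage (a minimal sketch, not part of the original module) ---
# The sample strings and token log-probabilities below are made up; they only
# demonstrate how the Metrics class defined above can be exercised end to end.
if __name__ == "__main__":
    demo_references = ["the company reported strong quarterly earnings"]
    demo_predictions = ["the company reported strong earnings this quarter"]
    demo_log_probs = [-0.5, -1.2, -0.8, -2.0]  # hypothetical per-token log probabilities

    calculator = Metrics()
    results = calculator.calculate_all_metrics(
        predictions=demo_predictions,
        references=demo_references,
        log_probs=demo_log_probs,
    )
    for name, value in results.items():
        print(f"{name}: {value:.2f}")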