Spaces:

Bmccloud22
/

LaunchLLM

Runtime error

File size: 7,374 Bytes

ec8f374

"""
Metrics Module

Provides various evaluation metrics for model performance assessment.
"""

import numpy as np
from typing import List, Dict, Optional, Union
import math


class Metrics:
    """
    Comprehensive metrics calculator for model evaluation.

    Supports:
    - BLEU score (NLTK when installed, word-overlap approximation otherwise)
    - ROUGE-L
    - Perplexity
    - Exact-match accuracy
    - Average response length

    Percentage-style scores are reported on a 0-100 scale.
    """

    def __init__(self):
        """Initialize metrics calculator."""
        # Not written by any method in this class; kept for callers that
        # may store results on the instance.
        self.results = {}

    def calculate_bleu(
        self,
        references: List[str],
        hypotheses: List[str],
        max_n: int = 4
    ) -> float:
        """
        Calculate corpus-level BLEU score.

        Uses NLTK's ``corpus_bleu`` with uniform weights over 1..max_n-grams
        and ``SmoothingFunction.method1``. When NLTK is not installed, falls
        back to ``_simple_bleu`` (a unigram set-overlap ratio that does not
        use ``max_n``).

        Args:
            references: Reference texts (one reference per hypothesis)
            hypotheses: Generated texts
            max_n: Maximum n-gram size

        Returns:
            BLEU score (0-100); 0.0 when either input list is empty

        Raises:
            ValueError: If max_n is smaller than 1
        """
        if max_n < 1:
            raise ValueError("max_n must be a positive integer")

        # Nothing to score; matches what the fallback returns for empty input.
        if not references or not hypotheses:
            return 0.0

        # Only the import is guarded so that errors raised by the actual
        # computation are not mistaken for a missing dependency.
        try:
            from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
        except ImportError:
            # Fallback: simple word overlap
            return self._simple_bleu(references, hypotheses)

        # Whitespace tokenization; corpus_bleu expects a list of reference
        # lists per hypothesis, hence the extra nesting.
        ref_tokens = [[ref.split()] for ref in references]
        hyp_tokens = [hyp.split() for hyp in hypotheses]

        # Uniform weights over 1..max_n so the max_n argument takes effect
        # (max_n=4 reproduces NLTK's default of four 0.25 weights).
        weights = tuple(1.0 / max_n for _ in range(max_n))

        score = corpus_bleu(
            ref_tokens,
            hyp_tokens,
            weights=weights,
            smoothing_function=SmoothingFunction().method1
        )

        return score * 100.0

    def _simple_bleu(self, references: List[str], hypotheses: List[str]) -> float:
        """
        Simple BLEU approximation without NLTK.

        For each (reference, hypothesis) pair, counts the distinct lowercased
        words the two share and divides the summed overlap by the summed size
        of the larger vocabulary of each pair.

        Args:
            references: Reference texts
            hypotheses: Generated texts

        Returns:
            Overlap ratio (0-100); 0.0 when there are no words to compare
        """
        total_overlap = 0
        total_length = 0

        # zip stops at the shorter list; unpaired items are ignored.
        for ref, hyp in zip(references, hypotheses):
            ref_words = set(ref.lower().split())
            hyp_words = set(hyp.lower().split())
            total_overlap += len(ref_words & hyp_words)
            total_length += max(len(ref_words), len(hyp_words))

        if total_length == 0:
            return 0.0

        return (total_overlap / total_length) * 100.0

    def calculate_rouge_l(
        self,
        references: List[str],
        hypotheses: List[str]
    ) -> Dict[str, float]:
        """
        Calculate ROUGE-L score.

        Precision, recall and F1 are computed per pair from the longest
        common subsequence of whitespace tokens, then averaged.

        Args:
            references: Reference texts
            hypotheses: Generated texts

        Returns:
            Dict with precision, recall, f1 (each 0-100); all 0.0 when
            references is empty
        """
        total_precision = 0
        total_recall = 0
        total_f1 = 0

        for ref, hyp in zip(references, hypotheses):
            ref_words = ref.split()
            hyp_words = hyp.split()

            # Find longest common subsequence
            lcs_length = self._lcs_length(ref_words, hyp_words)

            # Empty token lists score 0 rather than dividing by zero.
            precision = lcs_length / len(hyp_words) if len(hyp_words) > 0 else 0
            recall = lcs_length / len(ref_words) if len(ref_words) > 0 else 0
            f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

            total_precision += precision
            total_recall += recall
            total_f1 += f1

        # Averaged over the number of references, so a reference without a
        # matching hypothesis contributes zero to every average.
        n = len(references)
        return {
            'precision': (total_precision / n) * 100.0 if n > 0 else 0.0,
            'recall': (total_recall / n) * 100.0 if n > 0 else 0.0,
            'f1': (total_f1 / n) * 100.0 if n > 0 else 0.0
        }

    def _lcs_length(self, seq1: List[str], seq2: List[str]) -> int:
        """
        Calculate longest common subsequence length.

        Args:
            seq1: First token sequence
            seq2: Second token sequence

        Returns:
            Length of the longest (not necessarily contiguous) common
            subsequence; 0 when either sequence is empty
        """
        if not seq1 or not seq2:
            return 0

        # Each DP row depends only on the row above it, so two rows are
        # enough instead of a full len(seq1) x len(seq2) table.
        previous = [0] * (len(seq2) + 1)

        for item in seq1:
            current = [0]
            for j, other in enumerate(seq2, start=1):
                if item == other:
                    current.append(previous[j - 1] + 1)
                else:
                    current.append(max(previous[j], current[j - 1]))
            previous = current

        return previous[-1]

    def calculate_perplexity(
        self,
        log_probs: List[float]
    ) -> float:
        """
        Calculate perplexity from log probabilities.

        Computed as exp(-mean(log_probs)).

        Args:
            log_probs: List of log probabilities

        Returns:
            Perplexity score; infinity when log_probs is empty or the
            result is too large to represent as a float
        """
        if not log_probs:
            return float('inf')

        avg_log_prob = sum(log_probs) / len(log_probs)

        # math.exp raises OverflowError for large arguments (very negative
        # average log-probability); report that as infinite perplexity.
        try:
            return math.exp(-avg_log_prob)
        except OverflowError:
            return float('inf')

    def calculate_accuracy(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """
        Calculate exact match accuracy.

        Comparison ignores case and leading/trailing whitespace.

        Args:
            predictions: Predicted answers
            references: Reference answers

        Returns:
            Accuracy percentage (0-100); 0.0 when either list is empty
        """
        if not predictions or not references:
            return 0.0

        matches = sum(
            pred.strip().lower() == ref.strip().lower()
            for pred, ref in zip(predictions, references)
        )

        # Denominator is the prediction count, so predictions without a
        # paired reference count as misses.
        return (matches / len(predictions)) * 100.0

    def calculate_all_metrics(
        self,
        predictions: List[str],
        references: List[str],
        log_probs: Optional[List[float]] = None
    ) -> Dict[str, float]:
        """
        Calculate all available metrics.

        Each metric is computed independently; if one fails, the error is
        printed and that metric falls back to a default value so the
        remaining metrics are still reported.

        Args:
            predictions: Model predictions
            references: Reference answers
            log_probs: Optional log probabilities for perplexity

        Returns:
            Dict with keys 'bleu', 'rouge_l_precision', 'rouge_l_recall',
            'rouge_l_f1', 'accuracy', 'avg_response_length', plus
            'perplexity' when log_probs is non-empty
        """
        metrics = {}

        # BLEU
        try:
            metrics['bleu'] = self.calculate_bleu(references, predictions)
        except Exception as e:
            print(f"BLEU calculation error: {e}")
            metrics['bleu'] = 0.0

        # ROUGE-L
        try:
            rouge = self.calculate_rouge_l(references, predictions)
            metrics['rouge_l_precision'] = rouge['precision']
            metrics['rouge_l_recall'] = rouge['recall']
            metrics['rouge_l_f1'] = rouge['f1']
        except Exception as e:
            print(f"ROUGE calculation error: {e}")
            # Populate every ROUGE key so the result always has the same shape.
            metrics['rouge_l_precision'] = 0.0
            metrics['rouge_l_recall'] = 0.0
            metrics['rouge_l_f1'] = 0.0

        # Accuracy
        try:
            metrics['accuracy'] = self.calculate_accuracy(predictions, references)
        except Exception as e:
            print(f"Accuracy calculation error: {e}")
            metrics['accuracy'] = 0.0

        # Perplexity (only reported when log probabilities were supplied)
        if log_probs:
            try:
                metrics['perplexity'] = self.calculate_perplexity(log_probs)
            except Exception as e:
                print(f"Perplexity calculation error: {e}")
                metrics['perplexity'] = float('inf')

        # Average response length in whitespace-separated words; guarded so
        # an empty prediction list does not divide by zero.
        if predictions:
            metrics['avg_response_length'] = (
                sum(len(p.split()) for p in predictions) / len(predictions)
            )
        else:
            metrics['avg_response_length'] = 0.0

        return metrics


def calculate_perplexity(log_probs: List[float]) -> float:
    """
    Module-level convenience wrapper for perplexity.

    Creates a throwaway Metrics instance and delegates to its
    calculate_perplexity method.

    Args:
        log_probs: List of log probabilities

    Returns:
        Perplexity score
    """
    return Metrics().calculate_perplexity(log_probs)


def calculate_bleu(
    references: List[str],
    hypotheses: List[str],
    max_n: int = 4
) -> float:
    """
    Standalone function to calculate BLEU score.

    Creates a throwaway Metrics instance and delegates to its
    calculate_bleu method, forwarding max_n so the wrapper exposes the
    same parameters as the method it wraps.

    Args:
        references: Reference texts
        hypotheses: Generated texts
        max_n: Maximum n-gram size, forwarded to Metrics.calculate_bleu

    Returns:
        BLEU score (0-100)
    """
    return Metrics().calculate_bleu(references, hypotheses, max_n=max_n)