Spaces:

Bmccloud22
/

LaunchLLM

Runtime error

App Files Files Community

LaunchLLM / evaluation /metrics.py

Bmccloud22

Deploy LaunchLLM - Production AI Training Platform

ec8f374 verified about 1 month ago

raw

history blame contribute delete

7.37 kB

	"""
	Metrics Module

	Provides various evaluation metrics for model performance assessment.
	"""

	import numpy as np
	from typing import List, Dict, Optional, Union
	import math


	class Metrics:
	    """
	    Metrics calculator for model evaluation.

	    Supports:
	    - BLEU score (NLTK corpus BLEU, with a word-overlap fallback)
	    - ROUGE-L (precision / recall / F1 from the longest common subsequence)
	    - Perplexity (from log probabilities)
	    - Exact-match accuracy
	    - Average response length (in whitespace-separated tokens)

	    All text inputs are tokenized with ``str.split()``.
	    """

	    def __init__(self):
	        """Initialize metrics calculator."""
	        self.results = {}

	    def calculate_bleu(
	        self,
	        references: List[str],
	        hypotheses: List[str],
	        max_n: int = 4
	    ) -> float:
	        """
	        Calculate corpus-level BLEU score.

	        Uses NLTK's ``corpus_bleu`` with ``SmoothingFunction().method1`` when
	        NLTK is importable; otherwise falls back to ``_simple_bleu``, which
	        ignores ``max_n``.

	        Args:
	            references: Reference texts (one reference per hypothesis)
	            hypotheses: Generated texts
	            max_n: Maximum n-gram size; n-gram orders 1..max_n are weighted
	                uniformly (the default of 4 matches NLTK's default weights)

	        Returns:
	            BLEU score (0-100); 0.0 when either input is empty

	        Raises:
	            ValueError: If ``max_n`` is smaller than 1
	        """
	        if max_n < 1:
	            raise ValueError("max_n must be a positive integer")

	        # Nothing to score: avoid relying on NLTK's handling of an empty corpus.
	        if not references or not hypotheses:
	            return 0.0

	        try:
	            from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
	        except ImportError:
	            # Fallback: simple word overlap
	            return self._simple_bleu(references, hypotheses)

	        # corpus_bleu expects a list of reference lists per hypothesis.
	        ref_tokens = [[ref.split()] for ref in references]
	        hyp_tokens = [hyp.split() for hyp in hypotheses]

	        # Uniform weights over n-gram orders 1..max_n.
	        weights = tuple(1.0 / max_n for _ in range(max_n))

	        score = corpus_bleu(
	            ref_tokens,
	            hyp_tokens,
	            weights=weights,
	            smoothing_function=SmoothingFunction().method1
	        )

	        return score * 100.0

	    def _simple_bleu(self, references: List[str], hypotheses: List[str]) -> float:
	        """
	        Simple BLEU approximation without NLTK.

	        For each (reference, hypothesis) pair, counts the distinct lowercased
	        words shared by both and divides the summed overlap by the summed size
	        of the larger vocabulary of each pair.

	        Returns:
	            Overlap ratio scaled to 0-100; 0.0 when there are no words at all
	        """
	        total_overlap = 0
	        total_length = 0

	        for ref, hyp in zip(references, hypotheses):
	            ref_words = set(ref.lower().split())
	            hyp_words = set(hyp.lower().split())
	            total_overlap += len(ref_words & hyp_words)
	            total_length += max(len(ref_words), len(hyp_words))

	        if total_length == 0:
	            return 0.0

	        return (total_overlap / total_length) * 100.0

	    def calculate_rouge_l(
	        self,
	        references: List[str],
	        hypotheses: List[str]
	    ) -> Dict[str, float]:
	        """
	        Calculate ROUGE-L score.

	        Scores are computed per (reference, hypothesis) pair and averaged over
	        the pairs actually scored, so inputs of unequal length do not skew the
	        averages (surplus items in the longer list are ignored).

	        Args:
	            references: Reference texts
	            hypotheses: Generated texts

	        Returns:
	            Dict with 'precision', 'recall', 'f1' (each 0-100); all 0.0 when
	            there are no pairs to score
	        """
	        total_precision = 0.0
	        total_recall = 0.0
	        total_f1 = 0.0
	        pair_count = 0

	        for ref, hyp in zip(references, hypotheses):
	            ref_words = ref.split()
	            hyp_words = hyp.split()

	            # Find longest common subsequence
	            lcs_length = self._lcs_length(ref_words, hyp_words)

	            # Calculate metrics (empty sides score 0 rather than dividing by 0)
	            precision = lcs_length / len(hyp_words) if hyp_words else 0
	            recall = lcs_length / len(ref_words) if ref_words else 0
	            denominator = precision + recall
	            f1 = (2 * precision * recall / denominator) if denominator > 0 else 0

	            total_precision += precision
	            total_recall += recall
	            total_f1 += f1
	            pair_count += 1

	        if pair_count == 0:
	            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

	        return {
	            'precision': (total_precision / pair_count) * 100.0,
	            'recall': (total_recall / pair_count) * 100.0,
	            'f1': (total_f1 / pair_count) * 100.0
	        }

	    def _lcs_length(self, seq1: List[str], seq2: List[str]) -> int:
	        """
	        Calculate longest common subsequence length.

	        Dynamic programming over the two sequences, keeping only the previous
	        and current rows of the table.
	        """
	        if not seq1 or not seq2:
	            return 0

	        previous = [0] * (len(seq2) + 1)

	        for left in seq1:
	            current = [0]
	            for j, right in enumerate(seq2, start=1):
	                if left == right:
	                    current.append(previous[j - 1] + 1)
	                else:
	                    current.append(max(previous[j], current[j - 1]))
	            previous = current

	        return previous[-1]

	    def calculate_perplexity(
	        self,
	        log_probs: List[float]
	    ) -> float:
	        """
	        Calculate perplexity from log probabilities.

	        Computed as ``exp(-mean(log_probs))``.

	        Args:
	            log_probs: List of log probabilities

	        Returns:
	            Perplexity score; ``inf`` when ``log_probs`` is empty or when the
	            exponential overflows a float
	        """
	        if not log_probs:
	            return float('inf')

	        avg_log_prob = sum(log_probs) / len(log_probs)

	        try:
	            return math.exp(-avg_log_prob)
	        except OverflowError:
	            # Extremely negative average log-prob: perplexity is beyond float range.
	            return float('inf')

	    def calculate_accuracy(
	        self,
	        predictions: List[str],
	        references: List[str]
	    ) -> float:
	        """
	        Calculate exact match accuracy.

	        Predictions and references are compared position by position after
	        stripping surrounding whitespace and lowercasing. The denominator is
	        ``len(predictions)``, so predictions without a matching reference
	        count as misses, while surplus references are ignored.

	        Args:
	            predictions: Predicted answers
	            references: Reference answers

	        Returns:
	            Accuracy percentage (0-100); 0.0 when either input is empty
	        """
	        if not predictions or not references:
	            return 0.0

	        matches = sum(
	            pred.strip().lower() == ref.strip().lower()
	            for pred, ref in zip(predictions, references)
	        )

	        return (matches / len(predictions)) * 100.0

	    def calculate_all_metrics(
	        self,
	        predictions: List[str],
	        references: List[str],
	        log_probs: Optional[List[float]] = None
	    ) -> Dict[str, float]:
	        """
	        Calculate all available metrics.

	        Each metric is computed best-effort: if one raises, the error is
	        printed and that metric falls back to a default value so the
	        remaining metrics are still reported.

	        Args:
	            predictions: Model predictions
	            references: Reference answers
	            log_probs: Optional log probabilities for perplexity

	        Returns:
	            Dict with keys 'bleu', 'rouge_l_precision', 'rouge_l_recall',
	            'rouge_l_f1', 'accuracy', 'avg_response_length', plus
	            'perplexity' when ``log_probs`` is non-empty
	        """
	        metrics: Dict[str, float] = {}

	        # BLEU
	        try:
	            metrics['bleu'] = self.calculate_bleu(references, predictions)
	        except Exception as e:
	            print(f"BLEU calculation error: {e}")
	            metrics['bleu'] = 0.0

	        # ROUGE-L
	        try:
	            rouge = self.calculate_rouge_l(references, predictions)
	            metrics['rouge_l_precision'] = rouge['precision']
	            metrics['rouge_l_recall'] = rouge['recall']
	            metrics['rouge_l_f1'] = rouge['f1']
	        except Exception as e:
	            print(f"ROUGE calculation error: {e}")
	            # Keep the key set stable for consumers even on failure.
	            metrics['rouge_l_precision'] = 0.0
	            metrics['rouge_l_recall'] = 0.0
	            metrics['rouge_l_f1'] = 0.0

	        # Accuracy
	        try:
	            metrics['accuracy'] = self.calculate_accuracy(predictions, references)
	        except Exception as e:
	            print(f"Accuracy calculation error: {e}")
	            metrics['accuracy'] = 0.0

	        # Perplexity (only reported when log probabilities were supplied)
	        if log_probs:
	            try:
	                metrics['perplexity'] = self.calculate_perplexity(log_probs)
	            except Exception as e:
	                print(f"Perplexity calculation error: {e}")
	                metrics['perplexity'] = float('inf')

	        # Average response length in whitespace-separated tokens;
	        # 0.0 when there are no predictions (avoids dividing by zero).
	        if predictions:
	            metrics['avg_response_length'] = (
	                sum(len(p.split()) for p in predictions) / len(predictions)
	            )
	        else:
	            metrics['avg_response_length'] = 0.0

	        return metrics


	def calculate_perplexity(log_probs: List[float]) -> float:
	    """
	    Module-level convenience wrapper for perplexity.

	    Builds a throwaway ``Metrics`` instance and delegates to its
	    ``calculate_perplexity`` method.

	    Args:
	        log_probs: List of log probabilities

	    Returns:
	        Perplexity score
	    """
	    return Metrics().calculate_perplexity(log_probs)


	def calculate_bleu(references: List[str], hypotheses: List[str]) -> float:
	    """
	    Module-level convenience wrapper for BLEU.

	    Builds a throwaway ``Metrics`` instance and delegates to its
	    ``calculate_bleu`` method with the default n-gram size.

	    Args:
	        references: Reference texts
	        hypotheses: Generated texts

	    Returns:
	        BLEU score (0-100)
	    """
	    return Metrics().calculate_bleu(references, hypotheses)