# DataEngEval/src/scoring.py
"""
Scoring Module
Handles normalization and composite scoring for SQL evaluation results.
"""
import math
from typing import Dict, Any
from dataclasses import dataclass

@dataclass
class Metrics:
    """Evaluation metrics for a SQL query."""
    correctness_exact: float   # 0.0 or 1.0
    result_match_f1: float     # 0.0 to 1.0
    exec_success: float        # 0.0 or 1.0
    latency_ms: float          # milliseconds
    readability: float         # 0.0 to 1.0 (based on SQL structure)
    dialect_ok: float          # 0.0 or 1.0

class ScoringEngine:
    """Engine for computing composite scores from evaluation metrics."""

    def __init__(self):
        # Weights for composite scoring (sum should be 1.0)
        self.weights = {
            'correctness_exact': 0.4,   # Most important
            'exec_success': 0.25,       # Very important
            'result_match_f1': 0.15,    # Important for partial credit
            'dialect_ok': 0.1,          # Important for dialect compliance
            'readability': 0.05,        # Minor factor
            'latency': 0.05             # Minor factor (normalized)
        }
        # Latency normalization parameters
        self.latency_min_ms = 10.0      # Minimum expected latency
        self.latency_max_ms = 10000.0   # Maximum expected latency

    def normalize_latency(self, latency_ms: float) -> float:
        """Normalize latency on a log scale; lower latency maps to a higher score."""
        if latency_ms <= 0:
            return 0.0
        # Clamp to reasonable bounds
        latency_ms = max(self.latency_min_ms, min(latency_ms, self.latency_max_ms))
        # Log normalization: log(latency) / log(max_latency)
        normalized = math.log(latency_ms) / math.log(self.latency_max_ms)
        # Invert so lower latency = higher score
        return 1.0 - normalized
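
    # Illustrative values under the default bounds above (10 ms .. 10000 ms);
    # these follow directly from the formula and are shown only as a sketch:
    #   10 ms     -> 1 - log(10)/log(10000)     = 0.75  (best achievable score)
    #   1,000 ms  -> 1 - log(1000)/log(10000)   = 0.25
    #   10,000 ms -> 1 - log(10000)/log(10000)  = 0.00  (worst score)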

    def compute_readability_score(self, sql: str) -> float:
        """Compute readability score based on SQL structure."""
        if not sql or not sql.strip():
            return 0.0
        sql = sql.strip().upper()
        score = 0.0

        # Basic structure checks
        if 'SELECT' in sql:
            score += 0.2
        if 'FROM' in sql:
            score += 0.2
        if sql.count('(') == sql.count(')'):  # Balanced parentheses
            score += 0.1

        # Formatting checks
        if '\n' in sql:  # Multi-line formatting
            score += 0.1
        if sql.count(' ') > 5:  # Proper spacing
            score += 0.1

        # Complexity checks (more complex = slightly lower readability)
        complexity_penalty = 0.0
        if sql.count('JOIN') > 2:
            complexity_penalty += 0.1
        if sql.count('CASE') > 0:
            complexity_penalty += 0.05
        if sql.count('(') > 3:
            complexity_penalty += 0.05

        score = max(0.0, score - complexity_penalty)
        return min(1.0, score)
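
    # Worked example (values follow from the checks above, shown for illustration):
    #   "SELECT name, city FROM users\nWHERE active = 1"
    #   SELECT present (+0.2), FROM present (+0.2), balanced parentheses (+0.1),
    #   multi-line (+0.1), more than five spaces (+0.1), no complexity penalty
    #   -> 0.7, which is also the maximum the heuristics above can award.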

    def compute_composite_score(self, metrics: Metrics) -> float:
        """Compute composite score from individual metrics."""
        # Normalize latency
        normalized_latency = self.normalize_latency(metrics.latency_ms)

        # Fall back to a default readability when none was computed
        # (computing it here would require the original SQL text)
        readability = metrics.readability if metrics.readability > 0.0 else 0.8

        # Weighted sum
        composite_score = (
            self.weights['correctness_exact'] * metrics.correctness_exact +
            self.weights['exec_success'] * metrics.exec_success +
            self.weights['result_match_f1'] * metrics.result_match_f1 +
            self.weights['dialect_ok'] * metrics.dialect_ok +
            self.weights['readability'] * readability +
            self.weights['latency'] * normalized_latency
        )
        return round(composite_score, 4)
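
    # Worked example (hypothetical metric values, for illustration only):
    #   correctness_exact=1.0, exec_success=1.0, result_match_f1=1.0,
    #   dialect_ok=1.0, readability=0.8, normalized latency=0.5
    #   -> 0.4*1 + 0.25*1 + 0.15*1 + 0.1*1 + 0.05*0.8 + 0.05*0.5
    #    = 0.4 + 0.25 + 0.15 + 0.1 + 0.04 + 0.025 = 0.965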

    def compute_composite_score_from_dict(self, metrics_dict: Dict[str, Any]) -> float:
        """Compute composite score from a metrics dictionary."""
        metrics = Metrics(
            correctness_exact=metrics_dict.get('correctness_exact', 0.0),
            result_match_f1=metrics_dict.get('result_match_f1', 0.0),
            exec_success=metrics_dict.get('exec_success', 0.0),
            latency_ms=metrics_dict.get('latency_ms', 0.0),
            readability=metrics_dict.get('readability', 0.0),
            dialect_ok=metrics_dict.get('dialect_ok', 0.0)
        )
        return self.compute_composite_score(metrics)

    def get_score_breakdown(self, metrics: Metrics) -> Dict[str, float]:
        """Get detailed breakdown of how the composite score was computed."""
        normalized_latency = self.normalize_latency(metrics.latency_ms)
        # Apply the same readability fallback as compute_composite_score so the
        # per-metric contributions stay consistent with the composite score
        readability = metrics.readability if metrics.readability > 0.0 else 0.8
        breakdown = {
            'correctness_exact': self.weights['correctness_exact'] * metrics.correctness_exact,
            'exec_success': self.weights['exec_success'] * metrics.exec_success,
            'result_match_f1': self.weights['result_match_f1'] * metrics.result_match_f1,
            'dialect_ok': self.weights['dialect_ok'] * metrics.dialect_ok,
            'readability': self.weights['readability'] * readability,
            'latency': self.weights['latency'] * normalized_latency,
            'composite_score': self.compute_composite_score(metrics)
        }
        return breakdown

# Global scoring engine instance
scoring_engine = ScoringEngine()
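

if __name__ == "__main__":
    # Minimal usage sketch with hypothetical metric values (illustration only):
    # score a query that executed correctly in 120 ms with readable SQL.
    example_metrics = {
        'correctness_exact': 1.0,
        'result_match_f1': 1.0,
        'exec_success': 1.0,
        'latency_ms': 120.0,
        'readability': 0.7,
        'dialect_ok': 1.0,
    }
    score = scoring_engine.compute_composite_score_from_dict(example_metrics)
    print(f"Composite score: {score}")

    # Per-metric contributions behind the composite score
    breakdown = scoring_engine.get_score_breakdown(Metrics(**example_metrics))
    print(breakdown)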