# DataEngEval/src/scoring.py
"""
Scoring Module
Handles normalization and composite scoring for SQL evaluation results.
"""
import math
from typing import Dict, Any
from dataclasses import dataclass

@dataclass
class Metrics:
    """Evaluation metrics for a SQL query."""
    correctness_exact: float   # 0.0 or 1.0
    result_match_f1: float     # 0.0 to 1.0
    exec_success: float        # 0.0 or 1.0
    latency_ms: float          # milliseconds
    readability: float         # 0.0 to 1.0 (based on SQL structure)
    dialect_ok: float          # 0.0 or 1.0

class ScoringEngine:
    """Engine for computing composite scores from evaluation metrics."""

    def __init__(self):
        # Weights for composite scoring (sum should be 1.0)
        self.weights = {
            'correctness_exact': 0.4,   # Most important
            'exec_success': 0.25,       # Very important
            'result_match_f1': 0.15,    # Important for partial credit
            'dialect_ok': 0.1,          # Important for dialect compliance
            'readability': 0.05,        # Minor factor
            'latency': 0.05             # Minor factor (normalized)
        }
        # Latency normalization parameters
        self.latency_min_ms = 10.0      # Minimum expected latency
        self.latency_max_ms = 10000.0   # Maximum expected latency

    def normalize_latency(self, latency_ms: float) -> float:
        """Normalize latency on a log scale; lower latency maps to a higher score."""
        if latency_ms <= 0:
            return 0.0
        # Clamp to reasonable bounds
        latency_ms = max(self.latency_min_ms, min(latency_ms, self.latency_max_ms))
        # Log normalization: log(latency) / log(max_latency)
        normalized = math.log(latency_ms) / math.log(self.latency_max_ms)
        # Invert so lower latency = higher score
        return 1.0 - normalized
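
    # Illustrative values under the default bounds above (10 ms .. 10000 ms);
    # these follow directly from the formula and are shown only as a sketch:
    #   10 ms     -> 1 - log(10)/log(10000)     = 0.75  (best achievable score)
    #   1,000 ms  -> 1 - log(1000)/log(10000)   = 0.25
    #   10,000 ms -> 1 - log(10000)/log(10000)  = 0.00  (worst score)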

    def compute_readability_score(self, sql: str) -> float:
        """Compute readability score based on SQL structure."""
        if not sql or not sql.strip():
            return 0.0
        sql = sql.strip().upper()
        score = 0.0

        # Basic structure checks
        if 'SELECT' in sql:
            score += 0.2
        if 'FROM' in sql:
            score += 0.2
        if sql.count('(') == sql.count(')'):  # Balanced parentheses
            score += 0.1

        # Formatting checks
        if '\n' in sql:  # Multi-line formatting
            score += 0.1
        if sql.count(' ') > 5:  # Proper spacing
            score += 0.1

        # Complexity checks (more complex = slightly lower readability)
        complexity_penalty = 0.0
        if sql.count('JOIN') > 2:
            complexity_penalty += 0.1
        if sql.count('CASE') > 0:
            complexity_penalty += 0.05
        if sql.count('(') > 3:
            complexity_penalty += 0.05

        score = max(0.0, score - complexity_penalty)
        return min(1.0, score)
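
    # Worked example (values follow from the checks above, shown for illustration):
    #   "SELECT name, city FROM users\nWHERE active = 1"
    #   SELECT present (+0.2), FROM present (+0.2), balanced parentheses (+0.1),
    #   multi-line (+0.1), more than five spaces (+0.1), no complexity penalty
    #   -> 0.7, which is also the maximum the heuristics above can award.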

    def compute_composite_score(self, metrics: Metrics) -> float:
        """Compute composite score from individual metrics."""
        # Normalize latency
        normalized_latency = self.normalize_latency(metrics.latency_ms)

        # Fall back to a default readability when none was computed
        # (computing it here would require the original SQL text)
        readability = metrics.readability if metrics.readability > 0.0 else 0.8

        # Weighted sum
        composite_score = (
            self.weights['correctness_exact'] * metrics.correctness_exact +
            self.weights['exec_success'] * metrics.exec_success +
            self.weights['result_match_f1'] * metrics.result_match_f1 +
            self.weights['dialect_ok'] * metrics.dialect_ok +
            self.weights['readability'] * readability +
            self.weights['latency'] * normalized_latency
        )
        return round(composite_score, 4)
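
    # Worked example (hypothetical metric values, for illustration only):
    #   correctness_exact=1.0, exec_success=1.0, result_match_f1=1.0,
    #   dialect_ok=1.0, readability=0.8, normalized latency=0.5
    #   -> 0.4*1 + 0.25*1 + 0.15*1 + 0.1*1 + 0.05*0.8 + 0.05*0.5
    #    = 0.4 + 0.25 + 0.15 + 0.1 + 0.04 + 0.025 = 0.965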

    def compute_composite_score_from_dict(self, metrics_dict: Dict[str, Any]) -> float:
        """Compute composite score from a metrics dictionary."""
        metrics = Metrics(
            correctness_exact=metrics_dict.get('correctness_exact', 0.0),
            result_match_f1=metrics_dict.get('result_match_f1', 0.0),
            exec_success=metrics_dict.get('exec_success', 0.0),
            latency_ms=metrics_dict.get('latency_ms', 0.0),
            readability=metrics_dict.get('readability', 0.0),
            dialect_ok=metrics_dict.get('dialect_ok', 0.0)
        )
        return self.compute_composite_score(metrics)

    def get_score_breakdown(self, metrics: Metrics) -> Dict[str, float]:
        """Get detailed breakdown of how the composite score was computed."""
        normalized_latency = self.normalize_latency(metrics.latency_ms)
        # Apply the same readability fallback as compute_composite_score so the
        # per-metric contributions stay consistent with the composite score
        readability = metrics.readability if metrics.readability > 0.0 else 0.8
        breakdown = {
            'correctness_exact': self.weights['correctness_exact'] * metrics.correctness_exact,
            'exec_success': self.weights['exec_success'] * metrics.exec_success,
            'result_match_f1': self.weights['result_match_f1'] * metrics.result_match_f1,
            'dialect_ok': self.weights['dialect_ok'] * metrics.dialect_ok,
            'readability': self.weights['readability'] * readability,
            'latency': self.weights['latency'] * normalized_latency,
            'composite_score': self.compute_composite_score(metrics)
        }
        return breakdown

# Global scoring engine instance
scoring_engine = ScoringEngine()
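

if __name__ == "__main__":
    # Minimal usage sketch with hypothetical metric values (illustration only):
    # score a query that executed correctly in 120 ms with readable SQL.
    example_metrics = {
        'correctness_exact': 1.0,
        'result_match_f1': 1.0,
        'exec_success': 1.0,
        'latency_ms': 120.0,
        'readability': 0.7,
        'dialect_ok': 1.0,
    }
    score = scoring_engine.compute_composite_score_from_dict(example_metrics)
    print(f"Composite score: {score}")

    # Per-metric contributions behind the composite score
    breakdown = scoring_engine.get_score_breakdown(Metrics(**example_metrics))
    print(breakdown)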