| """ | |
| Scoring Module | |
| Handles normalization and composite scoring for SQL evaluation results. | |
| """ | |
| import math | |
| import numpy as np | |
| from typing import Dict, Any, List | |
| from dataclasses import dataclass | |


@dataclass
class Metrics:
    """Evaluation metrics for a SQL query."""
    correctness_exact: float  # 0.0 or 1.0
    result_match_f1: float    # 0.0 to 1.0
    exec_success: float       # 0.0 or 1.0
    latency_ms: float         # milliseconds
    readability: float        # 0.0 to 1.0 (based on SQL structure)
    dialect_ok: float         # 0.0 or 1.0


class ScoringEngine:
    """Engine for computing composite scores from evaluation metrics."""

    def __init__(self):
        # Weights for composite scoring (sum must be 1.0)
        self.weights = {
            'correctness_exact': 0.4,  # Most important
            'exec_success': 0.25,      # Very important
            'result_match_f1': 0.15,   # Important for partial credit
            'dialect_ok': 0.1,         # Important for dialect compliance
            'readability': 0.05,       # Minor factor
            'latency': 0.05            # Minor factor (normalized)
        }
        # Latency normalization bounds (inputs are clamped to this range)
        self.latency_min_ms = 10.0     # Minimum expected latency
        self.latency_max_ms = 10000.0  # Maximum expected latency

    def normalize_latency(self, latency_ms: float) -> float:
        """Normalize latency to [0, 1] on a log scale; lower latency scores higher."""
        if latency_ms <= 0:
            # Treat non-positive latency as a missing/failed measurement
            return 0.0
        # Clamp to the configured bounds
        latency_ms = max(self.latency_min_ms, min(latency_ms, self.latency_max_ms))
        # Min-max log normalization: 0.0 at latency_min_ms, 1.0 at latency_max_ms,
        # so the clamped range maps onto the full [0, 1] interval
        log_range = math.log(self.latency_max_ms) - math.log(self.latency_min_ms)
        normalized = (math.log(latency_ms) - math.log(self.latency_min_ms)) / log_range
        # Invert so lower latency = higher score
        return 1.0 - normalized
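
    # Worked example (illustrative arithmetic for the normalization above):
    #   latency_ms = 100.0
    #   normalized = (ln(100) - ln(10)) / (ln(10000) - ln(10)) = 1/3
    #   score      = 1.0 - 1/3 ≈ 0.667
    # With these bounds, each 10x increase in latency costs one third of the
    # latency score.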

    def compute_readability_score(self, sql: str) -> float:
        """Compute a heuristic readability score based on SQL structure."""
        if not sql or not sql.strip():
            return 0.0
        sql = sql.strip().upper()
        score = 0.0
        # Basic structure checks
        if 'SELECT' in sql:
            score += 0.2
        if 'FROM' in sql:
            score += 0.2
        if sql.count('(') == sql.count(')'):  # Balanced parentheses
            score += 0.1
        # Formatting checks
        if '\n' in sql:  # Multi-line formatting
            score += 0.1
        if sql.count(' ') > 5:  # Proper spacing
            score += 0.1
        # Complexity checks (more complex = slightly lower readability)
        complexity_penalty = 0.0
        if sql.count('JOIN') > 2:
            complexity_penalty += 0.1
        if sql.count('CASE') > 0:
            complexity_penalty += 0.05
        if sql.count('(') > 3:
            complexity_penalty += 0.05
        score = max(0.0, score - complexity_penalty)
        return min(1.0, score)
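
    # Hand-traced example against the checks above:
    #   sql = "SELECT id, name\nFROM users\nWHERE active = 1"
    #   SELECT (+0.2), FROM (+0.2), balanced parens (+0.1),
    #   multi-line (+0.1), more than 5 spaces (+0.1), no penalties
    #   -> score = 0.7 (the positive checks cap out at 0.7, not 1.0)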

    def compute_composite_score(self, metrics: Metrics) -> float:
        """Compute the weighted composite score from individual metrics."""
        # Normalize latency to [0, 1]
        normalized_latency = self.normalize_latency(metrics.latency_ms)
        # Fall back to a neutral default when no readability score was supplied
        # (computing one here would require the original SQL text). Using a
        # local variable avoids mutating the caller's Metrics instance.
        readability = metrics.readability if metrics.readability > 0.0 else 0.8
        # Weighted sum
        composite_score = (
            self.weights['correctness_exact'] * metrics.correctness_exact +
            self.weights['exec_success'] * metrics.exec_success +
            self.weights['result_match_f1'] * metrics.result_match_f1 +
            self.weights['dialect_ok'] * metrics.dialect_ok +
            self.weights['readability'] * readability +
            self.weights['latency'] * normalized_latency
        )
        return round(composite_score, 4)
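
    # Worked example: a fully correct query (all binary metrics 1.0) with
    # readability 0.7 and latency 100 ms (normalized to ~0.667 above):
    #   0.4*1 + 0.25*1 + 0.15*1 + 0.1*1 + 0.05*0.7 + 0.05*0.667 ≈ 0.9683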

    def compute_composite_score_from_dict(self, metrics_dict: Dict[str, Any]) -> float:
        """Compute composite score from a metrics dictionary."""
        metrics = Metrics(
            correctness_exact=metrics_dict.get('correctness_exact', 0.0),
            result_match_f1=metrics_dict.get('result_match_f1', 0.0),
            exec_success=metrics_dict.get('exec_success', 0.0),
            latency_ms=metrics_dict.get('latency_ms', 0.0),
            readability=metrics_dict.get('readability', 0.0),
            dialect_ok=metrics_dict.get('dialect_ok', 0.0)
        )
        return self.compute_composite_score(metrics)

    def get_score_breakdown(self, metrics: Metrics) -> Dict[str, float]:
        """Get a per-component breakdown of how the composite score was computed."""
        normalized_latency = self.normalize_latency(metrics.latency_ms)
        # Apply the same readability fallback as compute_composite_score so the
        # component contributions sum to the composite score (before rounding)
        readability = metrics.readability if metrics.readability > 0.0 else 0.8
        breakdown = {
            'correctness_exact': self.weights['correctness_exact'] * metrics.correctness_exact,
            'exec_success': self.weights['exec_success'] * metrics.exec_success,
            'result_match_f1': self.weights['result_match_f1'] * metrics.result_match_f1,
            'dialect_ok': self.weights['dialect_ok'] * metrics.dialect_ok,
            'readability': self.weights['readability'] * readability,
            'latency': self.weights['latency'] * normalized_latency,
            'composite_score': self.compute_composite_score(metrics)
        }
        return breakdown


# Global scoring engine instance
scoring_engine = ScoringEngine()
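

# Minimal usage sketch. The metric values below are made-up illustrations,
# not real evaluation output; a consumer would supply its own measurements.
if __name__ == "__main__":
    sample = {
        'correctness_exact': 1.0,
        'result_match_f1': 1.0,
        'exec_success': 1.0,
        'latency_ms': 100.0,
        'readability': 0.7,
        'dialect_ok': 1.0,
    }
    # ~0.9683 given the weights above
    print("composite:", scoring_engine.compute_composite_score_from_dict(sample))
    print("breakdown:", scoring_engine.get_score_breakdown(Metrics(**sample)))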