"""
Scoring Module
Handles normalization and composite scoring for SQL evaluation results.
"""

import math
from dataclasses import dataclass
from typing import Any, Dict


@dataclass
class Metrics:
    """Evaluation metrics for a SQL query."""
    correctness_exact: float  # 0.0 or 1.0
    result_match_f1: float    # 0.0 to 1.0
    exec_success: float       # 0.0 or 1.0
    latency_ms: float         # milliseconds
    readability: float        # 0.0 to 1.0 (based on SQL structure)
    dialect_ok: float         # 0.0 or 1.0


class ScoringEngine:
    """Engine for computing composite scores from evaluation metrics."""
    
    def __init__(self):
        # Weights for composite scoring (sum should be 1.0)
        self.weights = {
            'correctness_exact': 0.4,    # Most important
            'exec_success': 0.25,        # Very important
            'result_match_f1': 0.15,     # Important for partial credit
            'dialect_ok': 0.1,           # Important for dialect compliance
            'readability': 0.05,         # Minor factor
            'latency': 0.05              # Minor factor (normalized)
        }
        
        # Latency normalization parameters
        self.latency_min_ms = 10.0       # Minimum expected latency
        self.latency_max_ms = 10000.0    # Maximum expected latency
    
    def normalize_latency(self, latency_ms: float) -> float:
        """Normalize latency to [0, 1] on a log scale; lower latency scores higher."""
        if latency_ms <= 0:
            # Non-positive latency indicates a failed or missing measurement.
            return 0.0
        
        # Clamp to the expected bounds before normalizing.
        latency_ms = max(self.latency_min_ms, min(latency_ms, self.latency_max_ms))
        
        # Min-max normalization in log space so the score spans the full
        # [0, 1] range: with the default bounds, 10 ms -> 1.0,
        # ~316 ms -> 0.5, and 10,000 ms -> 0.0 after inversion.
        log_min = math.log(self.latency_min_ms)
        log_max = math.log(self.latency_max_ms)
        normalized = (math.log(latency_ms) - log_min) / (log_max - log_min)
        
        # Invert so lower latency yields a higher score.
        return 1.0 - normalized
    
    def compute_readability_score(self, sql: str) -> float:
        """Heuristic readability score based on SQL structure (tops out at 0.7)."""
        if not sql or not sql.strip():
            return 0.0
        
        sql = sql.strip().upper()
        score = 0.0
        
        # Basic structure checks.
        if 'SELECT' in sql:
            score += 0.2
        if 'FROM' in sql:
            score += 0.2
        if sql.count('(') == sql.count(')'):  # Balanced parentheses.
            score += 0.1
        
        # Formatting checks.
        if '\n' in sql:  # Multi-line formatting.
            score += 0.1
        if sql.count(' ') > 5:  # Enough whitespace to suggest deliberate spacing.
            score += 0.1
        
        # Complexity penalties: more complex queries read slightly worse.
        complexity_penalty = 0.0
        if sql.count('JOIN') > 2:
            complexity_penalty += 0.1
        if sql.count('CASE') > 0:
            complexity_penalty += 0.05
        if sql.count('(') > 3:
            complexity_penalty += 0.05
        
        score = max(0.0, score - complexity_penalty)
        return min(1.0, score)
    
    def compute_composite_score(self, metrics: Metrics) -> float:
        """Compute the weighted composite score from individual metrics."""
        # Normalize latency onto [0, 1].
        normalized_latency = self.normalize_latency(metrics.latency_ms)
        
        # A readability of 0.0 is treated as "not computed" (scoring it would
        # need the original SQL text), so fall back to a reasonable default.
        # Use a local value rather than mutating the caller's Metrics.
        readability = metrics.readability if metrics.readability > 0.0 else 0.8
        
        # Weighted sum of the normalized metrics.
        composite_score = (
            self.weights['correctness_exact'] * metrics.correctness_exact +
            self.weights['exec_success'] * metrics.exec_success +
            self.weights['result_match_f1'] * metrics.result_match_f1 +
            self.weights['dialect_ok'] * metrics.dialect_ok +
            self.weights['readability'] * readability +
            self.weights['latency'] * normalized_latency
        )
        
        return round(composite_score, 4)
    
    def compute_composite_score_from_dict(self, metrics_dict: Dict[str, Any]) -> float:
        """Compute composite score from metrics dictionary."""
        metrics = Metrics(
            correctness_exact=metrics_dict.get('correctness_exact', 0.0),
            result_match_f1=metrics_dict.get('result_match_f1', 0.0),
            exec_success=metrics_dict.get('exec_success', 0.0),
            latency_ms=metrics_dict.get('latency_ms', 0.0),
            readability=metrics_dict.get('readability', 0.0),
            dialect_ok=metrics_dict.get('dialect_ok', 0.0)
        )
        
        return self.compute_composite_score(metrics)
    
    def get_score_breakdown(self, metrics: Metrics) -> Dict[str, float]:
        """Get a per-metric breakdown of how the composite score was computed."""
        normalized_latency = self.normalize_latency(metrics.latency_ms)
        # Apply the same readability fallback as compute_composite_score so
        # the breakdown components stay consistent with the composite score.
        readability = metrics.readability if metrics.readability > 0.0 else 0.8
        
        breakdown = {
            'correctness_exact': self.weights['correctness_exact'] * metrics.correctness_exact,
            'exec_success': self.weights['exec_success'] * metrics.exec_success,
            'result_match_f1': self.weights['result_match_f1'] * metrics.result_match_f1,
            'dialect_ok': self.weights['dialect_ok'] * metrics.dialect_ok,
            'readability': self.weights['readability'] * readability,
            'latency': self.weights['latency'] * normalized_latency,
            'composite_score': self.compute_composite_score(metrics)
        }
        
        return breakdown


# Global scoring engine instance
scoring_engine = ScoringEngine()
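

# Illustrative usage sketch: the Metrics values below are made-up sample
# numbers (not from any real evaluation run), shown only to demonstrate the
# scoring API. Run this module directly to print a composite score and its
# per-metric breakdown.
if __name__ == "__main__":
    sample = Metrics(
        correctness_exact=1.0,
        result_match_f1=0.9,
        exec_success=1.0,
        latency_ms=120.0,
        readability=0.7,
        dialect_ok=1.0,
    )
    print("composite score:", scoring_engine.compute_composite_score(sample))
    print("readability heuristic:", scoring_engine.compute_readability_score(
        "SELECT id, name\nFROM users\nWHERE active = 1"))
    for component, value in scoring_engine.get_score_breakdown(sample).items():
        print(f"  {component}: {value:.4f}")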