Spaces:

newmindai
/

Mizan

Running

File size: 7,620 Bytes

9a235dc

#!/usr/bin/env python3
"""
Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
Simplified version for loading and processing CSV data
"""

import os
import pandas as pd
from pandas.io.formats.style import Styler
from matplotlib.colors import LinearSegmentedColormap
import html

# CSV file path
CSV_FILE_PATH = "leaderboard_data.csv"


def load_leaderboard_from_csv() -> pd.DataFrame:
    """Read the leaderboard CSV and return it ranked by "Mean (Task)".

    The file at ``CSV_FILE_PATH`` is read, converted with
    ``csv_to_leaderboard_format``, sorted by "Mean (Task)" in descending
    order, and given a 1-based "Rank" column that follows the sorted order.

    Returns:
        The ranked leaderboard. When the file does not exist, or any step
        raises, a message is printed and the empty frame from
        ``create_empty_leaderboard_dataframe`` is returned instead.
    """
    try:
        csv_missing = not os.path.exists(CSV_FILE_PATH)
        if csv_missing:
            print(f"❌ CSV file not found: {CSV_FILE_PATH}")
            return create_empty_leaderboard_dataframe()

        raw_rows = pd.read_csv(CSV_FILE_PATH)
        print(f"✅ Loaded {len(raw_rows)} records from {CSV_FILE_PATH}")

        # Convert, order best-first, then renumber the index so that the
        # rank assigned below lines up with the sorted position.
        ranked = (
            csv_to_leaderboard_format(raw_rows)
            .sort_values("Mean (Task)", ascending=False)
            .reset_index(drop=True)
        )
        row_count = len(ranked)
        ranked["Rank"] = range(1, row_count + 1)

        return ranked

    except Exception as load_error:
        # Best-effort boundary: report the problem and fall back to an
        # empty leaderboard rather than propagating the failure.
        print(f"❌ Error loading CSV: {load_error}")
        return create_empty_leaderboard_dataframe()


def create_empty_leaderboard_dataframe() -> pd.DataFrame:
    """Return a zero-row DataFrame carrying the leaderboard columns in order."""
    identity_columns = ["Rank", "Model"]
    score_columns = [
        "Mean (Task)",
        "Mean (TaskType)",
        "Classification",
        "Clustering",
        "Pair Classification",
        "Retrieval",
        "STS",
        "Correlation",
    ]
    model_info_columns = [
        "Parameters",
        "Embed Dim",
        "Max Sequence Length",
        "Vocab Size",
    ]

    leaderboard_columns = identity_columns + score_columns + model_info_columns
    return pd.DataFrame(columns=leaderboard_columns)


def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
    """Convert raw CSV rows into the leaderboard display format.

    Each input row becomes one output row: the model name is HTML-escaped
    and wrapped in a link to huggingface.co, score columns are rounded to
    2 decimals ("Correlation" to 3, or "N/A" when missing), and the model
    metadata columns are coerced to integers with placeholders for missing
    values.

    Args:
        df: Raw data. A non-empty frame must provide "Model", the score
            columns and a parameter-count column; "Vocab Size" is optional.
            A second header spelling is accepted for a few columns (see the
            ``resolve`` calls below); the first spelling always wins.

    Returns:
        A new DataFrame with the leaderboard columns in display order. An
        empty input yields an empty frame that still carries those columns.

    Raises:
        KeyError: If a required column is missing from a non-empty input.
        ValueError: If a score cell cannot be parsed as a number.
    """
    output_columns = [
        "Rank",
        "Model",
        "Mean (Task)",
        "Mean (TaskType)",
        "Classification",
        "Clustering",
        "Pair Classification",
        "Retrieval",
        "STS",
        "Correlation",
        "Parameters",
        "Embed Dim",
        "Max Sequence Length",
        "Vocab Size",
    ]

    def resolve(*candidates):
        """Return the first header spelling present in ``df``.

        Falls back to the first candidate so that a genuinely missing
        column still raises a KeyError naming the canonical header.
        """
        return next((name for name in candidates if name in df.columns), candidates[0])

    def int_or(value, fallback):
        """Coerce ``value`` to int; NaN and the literal "Unknown" give ``fallback``."""
        if pd.isna(value) or value == "Unknown":
            return fallback
        return int(float(value))

    # Handle different column name variations
    embedding_dim_col = resolve('Embedding Dim', 'Embed Dim')
    max_seq_col = resolve('Max Seq Length', 'Max Sequence Length')
    pair_classification_col = resolve('Pair Classification', 'PairClassification')
    parameters_col = resolve('Number of Parameters', 'Parameters')
    has_vocab_size = 'Vocab Size' in df.columns

    data = []
    # Rank comes from the row position, not the index label, so frames with
    # a filtered or non-integer index still get 1..n.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # str() keeps non-string cells (NaN, purely numeric names) from
        # crashing html.escape.
        model_name_clean = html.escape(str(row['Model']))

        # Create clickable HuggingFace link for model name
        hf_link = f"https://huggingface.co/{model_name_clean}"
        clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'

        correlation = row['Correlation']

        data.append({
            "Rank": position,  # Initial ranking; callers may recompute after sorting
            "Model": clickable_model,
            "Mean (Task)": round(float(row['Mean (Task)']), 2),
            "Mean (TaskType)": round(float(row['Mean (TaskType)']), 2),
            "Classification": round(float(row['Classification']), 2),
            "Clustering": round(float(row['Clustering']), 2),
            "Pair Classification": round(float(row[pair_classification_col]), 2),
            "Retrieval": round(float(row['Retrieval']), 2),
            "STS": round(float(row['STS']), 2),
            "Correlation": "N/A" if pd.isna(correlation) else round(float(correlation), 3),
            "Parameters": row[parameters_col],
            "Embed Dim": int_or(row[embedding_dim_col], 0),
            "Max Sequence Length": int_or(row[max_seq_col], "N/A"),
            "Vocab Size": int_or(row['Vocab Size'], 0) if has_vocab_size else 0,
        })

    # Passing the column list keeps the schema intact even when there are
    # no rows, so downstream sorting by "Mean (Task)" does not fail.
    return pd.DataFrame(data, columns=output_columns)


def create_excel_like_cmap():
    """Build the red -> yellow -> green colormap used to shade score cells.

    Returns:
        A ``LinearSegmentedColormap`` named "excel_like" with 256 levels,
        interpolated through the three RGB stops listed below.
    """
    red = (0.9, 0.1, 0.2)
    yellow = (1.0, 1.0, 0.0)
    excel_green = (0/255, 176/255, 80/255)  # Excel-style green, given as 0-255 channels

    color_stops = [red, yellow, excel_green]
    return LinearSegmentedColormap.from_list("excel_like", color_stops, N=256)


def rgb_to_hex(rgb_tuple):
    """Convert an RGB(A) tuple of 0-1 floats to a ``#rrggbb`` hex string.

    Only the first three components are used, so an RGBA tuple has its
    alpha channel ignored.

    Args:
        rgb_tuple: Sequence whose first three items are the red, green and
            blue channels as floats, nominally in the range 0-1.

    Returns:
        A lowercase 7-character hex color such as ``"#336699"``.

    Raises:
        ValueError: If fewer than three components are supplied.
    """
    # Clamp to [0, 1] so out-of-range channels cannot produce a negative or
    # three-digit hex component, and round (rather than truncate) so each
    # channel maps to the nearest 8-bit level.
    r, g, b = [round(min(max(float(x), 0.0), 1.0) * 255) for x in rgb_tuple[:3]]
    return f"#{r:02x}{g:02x}{b:02x}"


def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
    """Create colored cell HTML for score visualization"""
    if pd.isna(value) or value == "N/A":
        return str(value)
    
    try:
        # Normalize value to 0-1 range
        if max_val > min_val:
            normalized = (float(value) - min_val) / (max_val - min_val)
        else:
            normalized = 0.5
        
        # Get color from colormap
        color_rgba = colormap(normalized)
        color_hex = rgb_to_hex(color_rgba)
        
        # Create colored cell HTML with data-sort attribute for proper numeric sorting
        return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'
    
    except (ValueError, TypeError):
        return str(value)


def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
    """Create a styled leaderboard with color-coded score columns.

    Every score column present in ``df`` gets a per-cell background color
    taken from the colormap, scaled between that column's own numeric
    minimum and maximum, and is displayed with two decimals. The underlying
    data is left untouched; only the Styler's presentation changes.

    Args:
        df: Leaderboard data. Score columns may contain non-numeric
            placeholders such as "N/A"; those cells are left uncolored and
            are shown as-is.

    Returns:
        A pandas Styler wrapping ``df``. For an empty frame the unstyled
        ``df.style`` is returned.
    """
    if df.empty:
        return df.style

    colormap = create_excel_like_cmap()

    # Score columns to colorize
    score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
                    "Pair Classification", "Retrieval", "STS", "Correlation"]
    present_score_columns = [col for col in score_columns if col in df.columns]

    # Per-column min/max for normalization. Non-numeric cells are coerced
    # to NaN and ignored; a column with no numeric cell gets no range and
    # is therefore never colored.
    color_ranges = {}
    for col in present_score_columns:
        numeric_values = pd.to_numeric(df[col], errors='coerce')
        if not numeric_values.isna().all():
            color_ranges[col] = {
                'min': numeric_values.min(),
                'max': numeric_values.max()
            }

    def apply_color_gradient(val, col_name):
        """Return the CSS for one cell, or '' when it should stay unstyled."""
        if col_name not in color_ranges:
            return ''

        if pd.isna(val) or val == "N/A":
            return ''

        try:
            min_val = color_ranges[col_name]['min']
            max_val = color_ranges[col_name]['max']

            # Normalize value to 0-1 range; a constant column maps to the
            # middle of the colormap.
            if max_val > min_val:
                normalized = (float(val) - min_val) / (max_val - min_val)
            else:
                normalized = 0.5

            color_hex = rgb_to_hex(colormap(normalized))

            return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
        except (ValueError, TypeError):
            return ''

    def format_score(val):
        """Show numeric scores with two decimals; pass other values through.

        A plain '{:.2f}' format string raises ValueError at render time
        when a cell holds a string placeholder such as "N/A".
        """
        try:
            return f"{float(val):.2f}"
        except (ValueError, TypeError):
            return str(val)

    styler = df.style
    for col in present_score_columns:
        # Styler.map replaces the deprecated applymap; fall back to
        # applymap on pandas versions that do not provide map yet.
        apply_cellwise = styler.map if hasattr(styler, "map") else styler.applymap
        styler = apply_cellwise(lambda val, c=col: apply_color_gradient(val, c), subset=[col])

    # Format score columns to 2 decimal places; NaN cells render as 'N/A'
    if present_score_columns:
        format_dict = {col: format_score for col in present_score_columns}
        styler = styler.format(format_dict, na_rep='N/A')

    return styler