Mizan / data_processor.py
nmmursit's picture
Initial commit
9a235dc
raw
history blame
7.62 kB
#!/usr/bin/env python3
"""
Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
Simplified version for loading and processing CSV data
"""
import os
import pandas as pd
from pandas.io.formats.style import Styler
from matplotlib.colors import LinearSegmentedColormap
import html
# Path of the CSV file that holds the leaderboard data. It is a relative
# path, so it is resolved against the process's current working directory.
CSV_FILE_PATH = "leaderboard_data.csv"
def load_leaderboard_from_csv() -> pd.DataFrame:
    """Load the leaderboard from ``CSV_FILE_PATH`` and rank it by mean task score.

    Returns:
        The converted leaderboard sorted by "Mean (Task)" in descending
        order, with "Rank" renumbered 1..N to match that order. If the file
        does not exist, or anything fails while reading, converting or
        sorting it, the problem is printed and an empty leaderboard frame is
        returned instead (this loader is deliberately best-effort and never
        raises).
    """
    try:
        if not os.path.exists(CSV_FILE_PATH):
            print(f"❌ CSV file not found: {CSV_FILE_PATH}")
            return create_empty_leaderboard_dataframe()
        df = pd.read_csv(CSV_FILE_PATH)
        print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}")
        # Convert to leaderboard format
        leaderboard_df = csv_to_leaderboard_format(df)
        # Sort by Mean (Task) score and add rankings. A stable sort is
        # requested so that models with identical scores are ranked
        # deterministically instead of depending on the default (unstable)
        # quicksort.
        leaderboard_df = leaderboard_df.sort_values(
            "Mean (Task)", ascending=False, kind="stable"
        ).reset_index(drop=True)
        leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1)
        return leaderboard_df
    except Exception as e:
        # Top-level boundary: report the failure and fall back to an empty
        # table rather than crashing the caller.
        print(f"❌ Error loading CSV: {e}")
        return create_empty_leaderboard_dataframe()
def create_empty_leaderboard_dataframe() -> pd.DataFrame:
    """Return a zero-row DataFrame that carries the leaderboard column headers."""
    identity_columns = ["Rank", "Model"]
    aggregate_columns = ["Mean (Task)", "Mean (TaskType)"]
    task_type_columns = [
        "Classification",
        "Clustering",
        "Pair Classification",
        "Retrieval",
        "STS",
        "Correlation",
    ]
    model_info_columns = [
        "Parameters",
        "Embed Dim",
        "Max Sequence Length",
        "Vocab Size",
    ]
    # Concatenation order defines the column order of the resulting frame.
    leaderboard_columns = (
        identity_columns + aggregate_columns + task_type_columns + model_info_columns
    )
    return pd.DataFrame(columns=leaderboard_columns)
def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
    """Convert raw CSV rows into the leaderboard display schema.

    Each input row becomes one output row in which:

    * "Model" is an HTML anchor pointing at ``https://huggingface.co/<model>``
      (the model name is HTML-escaped; non-string values are stringified
      first so a numeric or missing cell cannot crash the conversion).
    * The score columns are rounded to 2 decimals; "Correlation" is rounded
      to 3 decimals, or is the string "N/A" when the value is missing.
    * "Embed Dim" and "Vocab Size" are ints, falling back to 0 when missing
      ("Vocab Size" may be absent from the CSV altogether).
    * "Max Sequence Length" is an int, or "N/A" when missing or "Unknown".
    * "Rank" is the 1-based row position; callers are expected to
      recalculate it after sorting.

    Args:
        df: Frame read from the leaderboard CSV. It must provide the columns
            'Model', 'Mean (Task)', 'Mean (TaskType)', 'Classification',
            'Clustering', 'Pair Classification', 'Retrieval', 'STS',
            'Correlation', 'Number of Parameters', 'Embedding Dim' and
            'Max Seq Length'.

    Returns:
        A new DataFrame in leaderboard format. The leaderboard columns are
        always present, even when ``df`` has no rows.

    Raises:
        KeyError: If a required column is missing from ``df``.
        ValueError: If a numeric cell cannot be parsed as a number.
    """
    # Spelled out explicitly so that a zero-row input still yields a frame
    # with the expected headers (a bare DataFrame([]) has no columns at all).
    output_columns = [
        "Rank",
        "Model",
        "Mean (Task)",
        "Mean (TaskType)",
        "Classification",
        "Clustering",
        "Pair Classification",
        "Retrieval",
        "STS",
        "Correlation",
        "Parameters",
        "Embed Dim",
        "Max Sequence Length",
        "Vocab Size",
    ]
    # CSV column names that differ from the display names.
    embedding_dim_col = 'Embedding Dim'
    max_seq_col = 'Max Seq Length'
    pair_classification_col = 'Pair Classification'

    data = []
    # Rank comes from the row position rather than the index label, so frames
    # with a non-default index (filtered, string-labelled, ...) still work.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # Prepare model name for display
        model_name_clean = html.escape(str(row['Model']))
        # Create clickable HuggingFace link for model name
        hf_link = f"https://huggingface.co/{model_name_clean}"
        clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'
        data.append({
            "Rank": position,  # Initial ranking, will be recalculated
            "Model": clickable_model,
            "Mean (Task)": round(float(row['Mean (Task)']), 2),
            "Mean (TaskType)": round(float(row['Mean (TaskType)']), 2),
            "Classification": round(float(row['Classification']), 2),
            "Clustering": round(float(row['Clustering']), 2),
            "Pair Classification": round(float(row[pair_classification_col]), 2),
            "Retrieval": round(float(row['Retrieval']), 2),
            "STS": round(float(row['STS']), 2),
            "Correlation": round(float(row['Correlation']), 3) if not pd.isna(row['Correlation']) else "N/A",
            "Parameters": row['Number of Parameters'],
            "Embed Dim": int(float(row[embedding_dim_col])) if not pd.isna(row[embedding_dim_col]) else 0,
            "Max Sequence Length": "N/A" if pd.isna(row[max_seq_col]) or row[max_seq_col] == "Unknown" else int(float(row[max_seq_col])),
            "Vocab Size": int(float(row['Vocab Size'])) if 'Vocab Size' in row and not pd.isna(row['Vocab Size']) else 0,
        })
    return pd.DataFrame(data, columns=output_columns)
def create_excel_like_cmap():
    """Build the red -> yellow -> green colormap used to shade score cells."""
    red = (0.9, 0.1, 0.2)
    yellow = (1.0, 1.0, 0.0)
    excel_green = (0/255, 176/255, 80/255)  # Excel-style Green
    gradient_stops = [red, yellow, excel_green]
    return LinearSegmentedColormap.from_list("excel_like", gradient_stops, N=256)
def rgb_to_hex(rgb_tuple):
    """Turn an RGB(A) sequence of 0-1 floats into a ``#rrggbb`` hex string.

    Only the first three components are used, so an alpha channel is
    ignored. Each channel is scaled by 255 and truncated to an integer.
    """
    red, green, blue = (int(channel * 255) for channel in rgb_tuple[:3])
    return "#{:02x}{:02x}{:02x}".format(red, green, blue)
def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
"""Create colored cell HTML for score visualization"""
if pd.isna(value) or value == "N/A":
return str(value)
try:
# Normalize value to 0-1 range
if max_val > min_val:
normalized = (float(value) - min_val) / (max_val - min_val)
else:
normalized = 0.5
# Get color from colormap
color_rgba = colormap(normalized)
color_hex = rgb_to_hex(color_rgba)
# Create colored cell HTML with data-sort attribute for proper numeric sorting
return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'
except (ValueError, TypeError):
return str(value)
def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
    """Build a pandas Styler that colour-codes the leaderboard score columns.

    The underlying data is left untouched: colours are applied as CSS and
    scores are only formatted for display (2 decimals). According to the
    original author's note this is meant for a Gradio Dataframe, so that
    colours and correct sorting are both available.

    Args:
        df: Leaderboard frame. Score columns may mix numbers with the
            placeholder string "N/A" (as "Correlation" does when the value
            is missing).

    Returns:
        A Styler over ``df``. For an empty frame the plain ``df.style`` is
        returned without any styling.
    """
    if df.empty:
        return df.style
    colormap = create_excel_like_cmap()
    # Score columns to colorize
    score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
                     "Pair Classification", "Retrieval", "STS", "Correlation"]
    present_score_columns = [col for col in score_columns if col in df.columns]

    # Calculate min/max for each score column for normalization. Non-numeric
    # cells (e.g. "N/A") are coerced to NaN and therefore ignored.
    color_ranges = {}
    for col in present_score_columns:
        numeric_values = pd.to_numeric(df[col], errors='coerce')
        if not numeric_values.isna().all():
            color_ranges[col] = {
                'min': numeric_values.min(),
                'max': numeric_values.max()
            }

    def apply_color_gradient(val, col_name):
        """Return the CSS for one cell, or '' when it should stay unstyled."""
        if col_name not in color_ranges:
            return ''
        if pd.isna(val) or val == "N/A":
            return ''
        try:
            min_val = color_ranges[col_name]['min']
            max_val = color_ranges[col_name]['max']
            # Normalize value to 0-1 range
            if max_val > min_val:
                normalized = (float(val) - min_val) / (max_val - min_val)
            else:
                normalized = 0.5
            # Get color from colormap
            color_rgba = colormap(normalized)
            color_hex = rgb_to_hex(color_rgba)
            return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
        except (ValueError, TypeError):
            return ''

    def format_score(val):
        """Format a score to 2 decimals, passing non-numeric text through.

        A plain '{:.2f}' format string raises ValueError at render time for
        string cells such as "N/A" (na_rep only covers real NaN values), so
        a tolerant callable is used instead.
        """
        if pd.isna(val):
            return 'N/A'
        try:
            return f'{float(val):.2f}'
        except (ValueError, TypeError):
            return str(val)

    # Apply styling to score columns using map (applymap is deprecated)
    styler = df.style
    for col in present_score_columns:
        # c=col binds the column name now, avoiding the late-binding closure trap.
        styler = styler.map(lambda val, c=col: apply_color_gradient(val, c), subset=[col])
    # Format score columns to 2 decimal places
    if present_score_columns:
        format_dict = {col: format_score for col in present_score_columns}
        styler = styler.format(format_dict, na_rep='N/A')
    return styler