Spaces:

newmindai
/

Mizan

Running

App Files Files Community

Mizan / data_processor.py

nmmursit

Initial commit

9a235dc about 2 months ago

raw

history blame

7.62 kB

	#!/usr/bin/env python3
	"""
	Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
	Simplified version for loading and processing CSV data
	"""

	import os
	import pandas as pd
	from pandas.io.formats.style import Styler
	from matplotlib.colors import LinearSegmentedColormap
	import html

	# Path of the CSV file that holds the leaderboard data. It is a relative
	# path, so it is resolved against the process's current working directory.
	CSV_FILE_PATH = "leaderboard_data.csv"


	def load_leaderboard_from_csv() -> pd.DataFrame:
	    """Read the leaderboard CSV and return it ranked by "Mean (Task)".

	    The file at ``CSV_FILE_PATH`` is converted with
	    ``csv_to_leaderboard_format``, ordered by descending "Mean (Task)" and
	    given a 1-based "Rank" column. If the file does not exist, or any step
	    raises, a message is printed and the frame produced by
	    ``create_empty_leaderboard_dataframe`` is returned instead.
	    """
	    try:
	        if os.path.exists(CSV_FILE_PATH):
	            raw_df = pd.read_csv(CSV_FILE_PATH)
	            print(f"✅ Loaded {len(raw_df)} records from {CSV_FILE_PATH}")

	            # Reshape into the display columns, then order best-first.
	            ranked = csv_to_leaderboard_format(raw_df)
	            ranked = ranked.sort_values("Mean (Task)", ascending=False)
	            ranked = ranked.reset_index(drop=True)

	            # Ranks follow the sorted order, starting at 1.
	            ranked["Rank"] = range(1, len(ranked) + 1)
	            return ranked

	        print(f"❌ CSV file not found: {CSV_FILE_PATH}")
	        return create_empty_leaderboard_dataframe()
	    except Exception as e:
	        # Best-effort loader: report the problem and fall back to an empty board.
	        print(f"❌ Error loading CSV: {e}")
	        return create_empty_leaderboard_dataframe()


	def create_empty_leaderboard_dataframe() -> pd.DataFrame:
	    """Return a zero-row DataFrame that carries the leaderboard column layout."""
	    identity_columns = ["Rank", "Model"]
	    score_columns = [
	        "Mean (Task)",
	        "Mean (TaskType)",
	        "Classification",
	        "Clustering",
	        "Pair Classification",
	        "Retrieval",
	        "STS",
	        "Correlation",
	    ]
	    model_spec_columns = [
	        "Parameters",
	        "Embed Dim",
	        "Max Sequence Length",
	        "Vocab Size",
	    ]
	    layout = identity_columns + score_columns + model_spec_columns
	    return pd.DataFrame(columns=layout)


	def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
	    """Convert raw CSV rows into the leaderboard display format.

	    Args:
	        df: Frame read from the CSV. Each row must provide 'Model',
	            'Mean (Task)', 'Mean (TaskType)', 'Classification', 'Clustering',
	            'Pair Classification', 'Retrieval', 'STS', 'Correlation',
	            'Number of Parameters', 'Embedding Dim' and 'Max Seq Length'.
	            'Vocab Size' is optional.

	    Returns:
	        A new DataFrame with one row per input row and the columns
	        "Rank", "Model", the score columns, "Correlation", "Parameters",
	        "Embed Dim", "Max Sequence Length" and "Vocab Size". "Rank" is the
	        1-based position of the row in ``df`` (callers may recompute it after
	        sorting). "Model" is an HTML link to the model's HuggingFace page.
	        An empty ``df`` yields an empty frame that still has these columns.

	    Raises:
	        KeyError: If a required input column is missing.
	        ValueError, TypeError: If a score or size value is not numeric.
	    """
	    output_columns = [
	        "Rank",
	        "Model",
	        "Mean (Task)",
	        "Mean (TaskType)",
	        "Classification",
	        "Clustering",
	        "Pair Classification",
	        "Retrieval",
	        "STS",
	        "Correlation",
	        "Parameters",
	        "Embed Dim",
	        "Max Sequence Length",
	        "Vocab Size",
	    ]
	    # Scores that are copied under the same name, rounded to 2 decimals.
	    score_columns = [
	        "Mean (Task)",
	        "Mean (TaskType)",
	        "Classification",
	        "Clustering",
	        "Pair Classification",
	        "Retrieval",
	        "STS",
	    ]
	    # Input column names that differ from the display names.
	    embedding_dim_col = 'Embedding Dim'
	    max_seq_col = 'Max Seq Length'

	    data = []
	    # Rank by position, not by index label: the label is only 0..n-1 for a
	    # default RangeIndex and may not even be an integer otherwise.
	    for position, (_, row) in enumerate(df.iterrows(), start=1):
	        # Escape once and reuse for both the href attribute and the link text.
	        model_name_clean = html.escape(row['Model'])
	        hf_link = f"https://huggingface.co/{model_name_clean}"
	        clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'

	        data_row = {"Rank": position, "Model": clickable_model}
	        for col in score_columns:
	            data_row[col] = round(float(row[col]), 2)

	        # A missing correlation is shown as the literal text "N/A".
	        correlation = row['Correlation']
	        data_row["Correlation"] = "N/A" if pd.isna(correlation) else round(float(correlation), 3)

	        data_row["Parameters"] = row['Number of Parameters']

	        embed_dim = row[embedding_dim_col]
	        data_row["Embed Dim"] = 0 if pd.isna(embed_dim) else int(float(embed_dim))

	        # The CSV may mark an unknown sequence length as NaN or "Unknown".
	        max_seq = row[max_seq_col]
	        if pd.isna(max_seq) or max_seq == "Unknown":
	            data_row["Max Sequence Length"] = "N/A"
	        else:
	            data_row["Max Sequence Length"] = int(float(max_seq))

	        has_vocab = 'Vocab Size' in row and not pd.isna(row['Vocab Size'])
	        data_row["Vocab Size"] = int(float(row['Vocab Size'])) if has_vocab else 0

	        data.append(data_row)

	    # Passing the columns explicitly keeps the schema even when there are no rows.
	    return pd.DataFrame(data, columns=output_columns)


	def create_excel_like_cmap():
	    """Build the red -> yellow -> green colormap used for score shading."""
	    red = (0.9, 0.1, 0.2)
	    yellow = (1.0, 1.0, 0.0)
	    excel_green = (0 / 255, 176 / 255, 80 / 255)  # Excel-style green

	    # Low values map to red, mid values to yellow, high values to green.
	    gradient_stops = [red, yellow, excel_green]
	    return LinearSegmentedColormap.from_list("excel_like", gradient_stops, N=256)


	def rgb_to_hex(rgb_tuple):
	    """Convert an RGB(A) sequence of floats in [0, 1] to a "#rrggbb" string.

	    Args:
	        rgb_tuple: Sequence whose first three items are the red, green and
	            blue components. Any further items (e.g. alpha) are ignored.

	    Returns:
	        The color as a lowercase "#rrggbb" hex string.

	    Raises:
	        ValueError: If fewer than three components are given or a component
	            is NaN.
	    """
	    # Round to the nearest 8-bit level instead of truncating: truncation
	    # turns e.g. 0.9999 into 254 and can lose a level to float error.
	    # Clamp so out-of-range inputs still produce a valid two-digit channel.
	    r, g, b = [min(255, max(0, round(float(x) * 255))) for x in rgb_tuple[:3]]
	    return f"#{r:02x}{g:02x}{b:02x}"


	def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
	    """Render a score as an HTML <div> shaded by its position in a range.

	    Args:
	        value: Score to render. Missing values (NaN/None) and the literal
	            "N/A" are returned as plain text without any markup.
	        min_val: Lower end of the range used for normalization.
	        max_val: Upper end of the range used for normalization.
	        colormap: Callable that maps a number to an RGB(A) tuple.

	    Returns:
	        The HTML for the colored cell, or ``str(value)`` when the value is
	        missing or a ValueError/TypeError occurs while computing the color.
	    """
	    missing = pd.isna(value) or value == "N/A"
	    if missing:
	        return str(value)

	    try:
	        # Relative position of the value inside [min_val, max_val]; a range
	        # with no spread maps everything to the middle of the colormap.
	        normalized = (
	            (float(value) - min_val) / (max_val - min_val)
	            if max_val > min_val
	            else 0.5
	        )

	        color_hex = rgb_to_hex(colormap(normalized))

	        # data-sort carries the raw value so the cell can be sorted numerically.
	        return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'
	    except (ValueError, TypeError):
	        return str(value)


	def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
	    """Return a pandas Styler that color-codes the leaderboard score columns.

	    Each score column present in ``df`` is shaded with the colormap from
	    ``create_excel_like_cmap``, normalized between that column's own minimum
	    and maximum numeric value, and displayed with two decimals. Cells that
	    are NaN or the literal "N/A" get no background; NaN is displayed as
	    "N/A" and non-numeric text is displayed unchanged.

	    Args:
	        df: Leaderboard frame, e.g. as produced by ``csv_to_leaderboard_format``.

	    Returns:
	        A Styler over ``df``. The underlying data is not modified, only its
	        presentation (the original author's note says this is what lets a
	        Gradio Dataframe show colors and still sort correctly).
	    """
	    if df.empty:
	        return df.style

	    colormap = create_excel_like_cmap()

	    # Score columns to colorize
	    score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
	                     "Pair Classification", "Retrieval", "STS", "Correlation"]
	    present_columns = [col for col in score_columns if col in df.columns]

	    # Per-column min/max used for normalization. Non-numeric cells such as
	    # "N/A" are coerced to NaN and ignored; a column with no numeric value
	    # at all gets no range and therefore no coloring.
	    color_ranges = {}
	    for col in present_columns:
	        numeric_values = pd.to_numeric(df[col], errors='coerce')
	        if not numeric_values.isna().all():
	            color_ranges[col] = (numeric_values.min(), numeric_values.max())

	    def make_colorizer(min_val, max_val):
	        """Build the per-cell CSS function for one column's value range."""
	        def colorize(val):
	            if pd.isna(val) or val == "N/A":
	                return ''
	            try:
	                # Normalize value to 0-1 range; a flat column maps to the middle.
	                if max_val > min_val:
	                    normalized = (float(val) - min_val) / (max_val - min_val)
	                else:
	                    normalized = 0.5
	                color_hex = rgb_to_hex(colormap(normalized))
	            except (ValueError, TypeError):
	                return ''
	            return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
	        return colorize

	    def format_score(val):
	        """Show numbers with two decimals and leave other text (e.g. "N/A") as is."""
	        try:
	            return f"{float(val):.2f}"
	        except (ValueError, TypeError):
	            return str(val)

	    styler = df.style
	    # Styler.map replaced the deprecated Styler.applymap; fall back to the
	    # old name on pandas versions that do not have map yet.
	    elementwise = "map" if hasattr(styler, "map") else "applymap"
	    for col, (min_val, max_val) in color_ranges.items():
	        styler = getattr(styler, elementwise)(make_colorizer(min_val, max_val), subset=[col])

	    # A '{:.2f}' format string would raise ValueError at render time on the
	    # "N/A" strings stored in "Correlation" (na_rep only covers real NaN),
	    # so a tolerant callable is used for every score column instead.
	    if present_columns:
	        styler = styler.format({col: format_score for col in present_columns}, na_rep='N/A')

	    return styler