|
|
|
|
|
""" |
|
|
UI Components module for MTEB Turkish Leaderboard - HF Spaces Version |
|
|
Simplified version with only leaderboard and dataset components |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
from data_processor import (create_styled_leaderboard_dataframe, |
|
|
create_empty_leaderboard_dataframe) |
|
|
|
|
|
|
|
|
def create_leaderboard_tab(current_data: pd.DataFrame):
    """Create the main leaderboard tab with color styling.

    Args:
        current_data: Leaderboard rows; must contain a "Model" column.
            Falls back to an empty skeleton table when missing/invalid.

    Returns:
        gr.Dataframe: the rendered leaderboard component.
    """
    # Guard against empty or malformed input so the UI always renders a table.
    if current_data.empty or "Model" not in current_data.columns:
        print("β οΈ Warning: Empty or invalid data, using empty leaderboard structure")
        current_data = create_empty_leaderboard_dataframe()

    styled = create_styled_leaderboard_dataframe(current_data)

    # Column layout: rank, linked model name (HTML), eight score columns,
    # then mixed metadata columns. Kept in one place for easy adjustment.
    column_types = ["number", "html"] + ["number"] * 8 + ["str", "number", "str", "number"]
    column_widths = [
        "70px", "250px", "130px", "130px", "160px", "130px", "170px",
        "130px", "100px", "130px", "120px", "120px", "120px", "120px",
    ]

    leaderboard = gr.Dataframe(
        value=styled,
        interactive=False,
        wrap=True,
        max_height=600,
        show_search=True,
        datatype=column_types,
        column_widths=column_widths,
    )

    gr.Markdown("""
    ### π How to Use the Leaderboard:
    - **Search**: Use the search box to find specific models
    - **Color Coding**: Scores are color-coded from red (low) to green (high)
    - **Sorting**: Click on column headers to sort by different metrics
    - **Rankings**: Models are ranked by Mean (Task) score

    ### π Performance Insights:
    - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
    - **Specialized Models**: Some models excel in specific tasks (e.g., retrieval vs classification)
    - **Model Size vs Performance**: Larger models generally perform better but with exceptions
    """)

    return leaderboard
|
|
|
|
|
|
|
|
def create_dataset_tab():
    """Create the dataset information tab.

    Builds three read-only views: a per-task table with clickable
    HuggingFace dataset links, a markdown task-distribution summary,
    and an aggregate statistics table.

    Returns:
        gr.Dataframe: the main task-details table component.
    """
    gr.Markdown("### π MTEB Turkish Dataset Overview")

    # Single source of truth for task metadata. Previously the task names
    # were repeated in a dict AND a separate hard-coded list, with four
    # parallel column lists that could silently drift out of alignment.
    # Each row: (task name, HF dataset path, task type, description, domain, samples)
    task_info = [
        ('WebFAQRetrieval', 'PaDaS-Lab/webfaq-retrieval', 'Retrieval',
         'Turkish FAQ retrieval task', 'FAQ/QA', '~135K'),
        ('XQuADRetrieval', 'google/xquad', 'Retrieval',
         'Turkish question answering retrieval', 'QA', '~10K'),
        ('TurHistQuadRetrieval', 'asparius/TurHistQuAD', 'Retrieval',
         'Historical Turkish document retrieval', 'Historical', '~1.4K'),
        ('MKQARetrieval', 'apple/mkqa', 'Retrieval',
         'Multilingual knowledge QA retrieval', 'Knowledge QA', '~10K'),
        ('MassiveIntentClassification', 'mteb/amazon_massive_intent', 'Classification',
         'Intent classification for Turkish', 'Intent', '~11K'),
        ('MassiveScenarioClassification', 'mteb/amazon_massive_scenario', 'Classification',
         'Scenario classification for Turkish', 'Scenario', '~11K'),
        ('MultilingualSentimentClassification', 'mteb/multilingual-sentiment-classification', 'Classification',
         'Multilingual sentiment classification', 'Sentiment', '~4.5K'),
        ('SIB200Classification', 'mteb/sib200', 'Classification',
         'SIB200 language identification', 'Language ID', '~700'),
        ('TurkishMovieSentimentClassification', 'asparius/Turkish-Movie-Review', 'Classification',
         'Turkish movie review sentiment', 'Movies', '~8K'),
        ('TurkishProductSentimentClassification', 'asparius/Turkish-Product-Review', 'Classification',
         'Turkish product review sentiment', 'Products', '~4.8K'),
        ('SIB200ClusteringS2S', 'mteb/sib200', 'Clustering',
         'SIB200 clustering task', 'Language ID', '~1K'),
        ('XNLI', 'mteb/xnli', 'PairClassification',
         'Turkish natural language inference', 'NLI', '~1.4K'),
        ('XNLIV2', 'mteb/xnli2.0-multi-pair', 'PairClassification',
         'Enhanced Turkish NLI task', 'NLI', '~1.4K'),
        ('STS22.v2', 'mteb/sts22-crosslingual-sts', 'STS',
         'Turkish semantic textual similarity', 'STS', '~400'),
    ]

    # Render each task name as a link to its HF dataset page (html column).
    clickable_task_names = [
        f'<a href="https://huggingface.co/datasets/{dataset_path}" target="_blank" '
        f'style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
        for task_name, dataset_path, *_ in task_info
    ]

    dataset_data = pd.DataFrame({
        'Task Name': clickable_task_names,
        'Task Type': [row[2] for row in task_info],
        'Description': [row[3] for row in task_info],
        'Domain': [row[4] for row in task_info],
        'Samples': [row[5] for row in task_info],
    })

    dataset_table = gr.Dataframe(
        value=dataset_data,
        label="MTEB Turkish Task Details",
        interactive=False,
        wrap=True,
        # First column is HTML (the clickable links); the rest are plain text.
        datatype=["html", "str", "str", "str", "str"]
    )

    gr.Markdown("""
    ### π Task Distribution:

    **By Task Type:**
    - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
    - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
    - **Pair Classification**: 2 tasks (natural language inference)
    - **Clustering**: 1 task (language clustering)
    - **STS**: 1 task (semantic textual similarity)

    **By Domain:**
    - **Sentiment Analysis**: Movie and product reviews
    - **Question Answering**: FAQ, reading comprehension, and knowledge QA
    - **Intent/Scenario**: Conversational AI applications
    - **Language Tasks**: NLI, STS, clustering
    - **Multilingual**: Cross-lingual evaluation capabilities
    """)

    # High-level benchmark statistics shown below the per-task table.
    stats_data = pd.DataFrame({
        'Metric': [
            'Total Tasks',
            'Total Samples',
            'Task Types',
            'Languages',
            'Avg. Tokens per Sample'
        ],
        'Value': [
            '14 tasks',
            '~190K samples',
            '5 types',
            'Turkish + Multilingual',
            '~150 tokens'
        ],
        'Notes': [
            'Comprehensive evaluation across domains',
            'Large-scale evaluation dataset',
            'Classification, Retrieval, STS, NLI, Clustering',
            'Focus on Turkish with multilingual support',
            'Varies by task type and domain'
        ]
    })

    gr.Dataframe(
        value=stats_data,
        label="Dataset Statistics Summary",
        interactive=False
    )

    gr.Markdown("""
    ### π― Evaluation Methodology:

    **Scoring:**
    - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
    - **Mean (Task)**: Direct average of all individual task scores
    - **Mean (TaskType)**: Average of task category means
    - **Individual Categories**: Performance in each task type

    **Model Ranking:**
    - Primary ranking by **Mean (Task)** score
    - Correlation metrics provide additional insights
    - Task-specific performance shows model strengths

    **Quality Assurance:**
    - Standardized evaluation protocols
    - Consistent preprocessing across tasks
    - Multiple metrics per task for robustness
    """)

    return dataset_table
|
|
|
|
|
def create_submit_evaluation_tab():
    """Create the submit evaluation tab with form.

    Returns:
        tuple: (model textbox, email textbox, submit button,
        login button, result HTML output) for wiring callbacks.
    """
    gr.Markdown("### π Submit Model for Evaluation")
    gr.Markdown("""
    Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
    **Authentication with Hugging Face is required to submit evaluations.**
    """)

    # HF OAuth sign-in is mandatory before a submission is accepted.
    hf_login = gr.LoginButton(value="Sign in with Hugging Face")

    model_box = gr.Textbox(
        label="π€ Model Name",
        placeholder="sentence-transformers/your-model",
        info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
    )

    email_box = gr.Textbox(
        label="π§ Email Address",
        placeholder="your.email@example.com",
        info="Email for notifications about evaluation status and results"
    )

    submit_button = gr.Button(
        "π Submit",
        variant="primary",
        size="lg"
    )

    # Status/result feedback rendered as HTML after a submission attempt.
    status_html = gr.HTML(label="Status")

    gr.Markdown("""
    ### π Evaluation Process:
    1. **Sign In**: First, sign in with your Hugging Face account using the button above
    2. **Submit Request**: Fill out the form with your model details and email
    3. **Admin Review**: Your request will be reviewed by administrators
    4. **Evaluation**: If approved, your model will be evaluated on MTEB Turkish benchmark
    5. **Results**: You'll receive email notifications and results will appear on the leaderboard

    ### β οΈ Important Notes:
    - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
    - You'll receive email updates about your request status
    - Make sure your model is publicly available on HuggingFace
    - Valid email address is required for receiving results
    """)

    # Order matters: callers unpack (model, email, submit, login, output).
    return (model_box, email_box, submit_button, hf_login, status_html)
|
|
|