|
|
|
|
|
""" |
|
|
UI Components module for MTEB Turkish Leaderboard - HF Spaces Version |
|
|
Simplified version with only leaderboard and dataset components |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
from data_processor import (create_styled_leaderboard_dataframe, |
|
|
create_empty_leaderboard_dataframe) |
|
|
|
|
|
|
|
|
def create_leaderboard_tab(current_data: pd.DataFrame):
    """Create the main leaderboard tab with color styling.

    Args:
        current_data: Leaderboard rows; must contain a "Model" column.
            Falls back to an empty skeleton table when missing/invalid.

    Returns:
        gr.Dataframe: the rendered leaderboard component.
    """
    # Guard against empty or malformed input so the UI always renders a table.
    if current_data.empty or "Model" not in current_data.columns:
        print("β οΈ Warning: Empty or invalid data, using empty leaderboard structure")
        current_data = create_empty_leaderboard_dataframe()

    styled = create_styled_leaderboard_dataframe(current_data)

    # Column layout: rank, linked model name (HTML), eight score columns,
    # then mixed metadata columns. Kept in one place for easy adjustment.
    column_types = ["number", "html"] + ["number"] * 8 + ["str", "number", "str", "number"]
    column_widths = [
        "70px", "250px", "130px", "130px", "160px", "130px", "170px",
        "130px", "100px", "130px", "120px", "120px", "120px", "120px",
    ]

    leaderboard = gr.Dataframe(
        value=styled,
        interactive=False,
        wrap=True,
        max_height=600,
        show_search=True,
        datatype=column_types,
        column_widths=column_widths,
    )

    gr.Markdown("""
    ### π How to Use the Leaderboard:
    - **Search**: Use the search box to find specific models
    - **Color Coding**: Scores are color-coded from red (low) to green (high)
    - **Sorting**: Click on column headers to sort by different metrics
    - **Rankings**: Models are ranked by Mean (Task) score

    ### π Performance Insights:
    - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
    - **Specialized Models**: Some models excel in specific tasks (e.g., retrieval vs classification)
    - **Model Size vs Performance**: Larger models generally perform better but with exceptions
    """)

    return leaderboard
|
|
|
|
|
|
|
|
def create_dataset_tab():
    """Create the dataset information tab.

    Builds three read-only views: a per-task table with clickable
    HuggingFace dataset links, a markdown task-distribution summary,
    and an aggregate statistics table.

    Returns:
        gr.Dataframe: the main task-details table component.
    """
    gr.Markdown("### π MTEB Turkish Dataset Overview")

    # Single source of truth for task metadata. Previously the task names
    # were repeated in a dict AND a separate hard-coded list, with four
    # parallel column lists that could silently drift out of alignment.
    # Each row: (task name, HF dataset path, task type, description, domain, samples)
    task_info = [
        ('WebFAQRetrieval', 'PaDaS-Lab/webfaq-retrieval', 'Retrieval',
         'Turkish FAQ retrieval task', 'FAQ/QA', '~135K'),
        ('XQuADRetrieval', 'google/xquad', 'Retrieval',
         'Turkish question answering retrieval', 'QA', '~10K'),
        ('TurHistQuadRetrieval', 'asparius/TurHistQuAD', 'Retrieval',
         'Historical Turkish document retrieval', 'Historical', '~1.4K'),
        ('MKQARetrieval', 'apple/mkqa', 'Retrieval',
         'Multilingual knowledge QA retrieval', 'Knowledge QA', '~10K'),
        ('MassiveIntentClassification', 'mteb/amazon_massive_intent', 'Classification',
         'Intent classification for Turkish', 'Intent', '~11K'),
        ('MassiveScenarioClassification', 'mteb/amazon_massive_scenario', 'Classification',
         'Scenario classification for Turkish', 'Scenario', '~11K'),
        ('MultilingualSentimentClassification', 'mteb/multilingual-sentiment-classification', 'Classification',
         'Multilingual sentiment classification', 'Sentiment', '~4.5K'),
        ('SIB200Classification', 'mteb/sib200', 'Classification',
         'SIB200 language identification', 'Language ID', '~700'),
        ('TurkishMovieSentimentClassification', 'asparius/Turkish-Movie-Review', 'Classification',
         'Turkish movie review sentiment', 'Movies', '~8K'),
        ('TurkishProductSentimentClassification', 'asparius/Turkish-Product-Review', 'Classification',
         'Turkish product review sentiment', 'Products', '~4.8K'),
        ('SIB200ClusteringS2S', 'mteb/sib200', 'Clustering',
         'SIB200 clustering task', 'Language ID', '~1K'),
        ('XNLI', 'mteb/xnli', 'PairClassification',
         'Turkish natural language inference', 'NLI', '~1.4K'),
        ('XNLIV2', 'mteb/xnli2.0-multi-pair', 'PairClassification',
         'Enhanced Turkish NLI task', 'NLI', '~1.4K'),
        ('STS22.v2', 'mteb/sts22-crosslingual-sts', 'STS',
         'Turkish semantic textual similarity', 'STS', '~400'),
    ]

    # Render each task name as a link to its HF dataset page (html column).
    clickable_task_names = [
        f'<a href="https://huggingface.co/datasets/{dataset_path}" target="_blank" '
        f'style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
        for task_name, dataset_path, *_ in task_info
    ]

    dataset_data = pd.DataFrame({
        'Task Name': clickable_task_names,
        'Task Type': [row[2] for row in task_info],
        'Description': [row[3] for row in task_info],
        'Domain': [row[4] for row in task_info],
        'Samples': [row[5] for row in task_info],
    })

    dataset_table = gr.Dataframe(
        value=dataset_data,
        label="MTEB Turkish Task Details",
        interactive=False,
        wrap=True,
        # First column is HTML (the clickable links); the rest are plain text.
        datatype=["html", "str", "str", "str", "str"]
    )

    gr.Markdown("""
    ### π Task Distribution:

    **By Task Type:**
    - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
    - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
    - **Pair Classification**: 2 tasks (natural language inference)
    - **Clustering**: 1 task (language clustering)
    - **STS**: 1 task (semantic textual similarity)

    **By Domain:**
    - **Sentiment Analysis**: Movie and product reviews
    - **Question Answering**: FAQ, reading comprehension, and knowledge QA
    - **Intent/Scenario**: Conversational AI applications
    - **Language Tasks**: NLI, STS, clustering
    - **Multilingual**: Cross-lingual evaluation capabilities
    """)

    # High-level benchmark statistics shown below the per-task table.
    stats_data = pd.DataFrame({
        'Metric': [
            'Total Tasks',
            'Total Samples',
            'Task Types',
            'Languages',
            'Avg. Tokens per Sample'
        ],
        'Value': [
            '14 tasks',
            '~190K samples',
            '5 types',
            'Turkish + Multilingual',
            '~150 tokens'
        ],
        'Notes': [
            'Comprehensive evaluation across domains',
            'Large-scale evaluation dataset',
            'Classification, Retrieval, STS, NLI, Clustering',
            'Focus on Turkish with multilingual support',
            'Varies by task type and domain'
        ]
    })

    gr.Dataframe(
        value=stats_data,
        label="Dataset Statistics Summary",
        interactive=False
    )

    gr.Markdown("""
    ### π― Evaluation Methodology:

    **Scoring:**
    - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
    - **Mean (Task)**: Direct average of all individual task scores
    - **Mean (TaskType)**: Average of task category means
    - **Individual Categories**: Performance in each task type

    **Model Ranking:**
    - Primary ranking by **Mean (Task)** score
    - Correlation metrics provide additional insights
    - Task-specific performance shows model strengths

    **Quality Assurance:**
    - Standardized evaluation protocols
    - Consistent preprocessing across tasks
    - Multiple metrics per task for robustness
    """)

    return dataset_table
|
|
|
|
|
def create_submit_evaluation_tab():
    """Create the submit evaluation tab with form.

    Returns:
        tuple: (model textbox, email textbox, submit button,
        login button, result HTML output) for wiring callbacks.
    """
    gr.Markdown("### π Submit Model for Evaluation")
    gr.Markdown("""
    Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
    **Authentication with Hugging Face is required to submit evaluations.**
    """)

    # HF OAuth sign-in is mandatory before a submission is accepted.
    hf_login = gr.LoginButton(value="Sign in with Hugging Face")

    model_box = gr.Textbox(
        label="π€ Model Name",
        placeholder="sentence-transformers/your-model",
        info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
    )

    email_box = gr.Textbox(
        label="π§ Email Address",
        placeholder="your.email@example.com",
        info="Email for notifications about evaluation status and results"
    )

    submit_button = gr.Button(
        "π Submit",
        variant="primary",
        size="lg"
    )

    # Status/result feedback rendered as HTML after a submission attempt.
    status_html = gr.HTML(label="Status")

    gr.Markdown("""
    ### π Evaluation Process:
    1. **Sign In**: First, sign in with your Hugging Face account using the button above
    2. **Submit Request**: Fill out the form with your model details and email
    3. **Admin Review**: Your request will be reviewed by administrators
    4. **Evaluation**: If approved, your model will be evaluated on MTEB Turkish benchmark
    5. **Results**: You'll receive email notifications and results will appear on the leaderboard

    ### β οΈ Important Notes:
    - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
    - You'll receive email updates about your request status
    - Make sure your model is publicly available on HuggingFace
    - Valid email address is required for receiving results
    """)

    # Order matters: callers unpack (model, email, submit, login, output).
    return (model_box, email_box, submit_button, hf_login, status_html)
|
|
|