File size: 10,129 Bytes
9a235dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
#!/usr/bin/env python3
"""
UI Components module for MTEB Turkish Leaderboard - HF Spaces Version
Simplified version with only leaderboard and dataset components
"""
import gradio as gr
import pandas as pd
from data_processor import (create_styled_leaderboard_dataframe,
create_empty_leaderboard_dataframe)
def create_leaderboard_tab(current_data: pd.DataFrame):
    """Build the main leaderboard tab with color-styled score columns.

    Returns:
        gr.Dataframe: the leaderboard table component.
    """
    # Guard: fall back to an empty-but-well-formed table when the incoming
    # data is missing or lacks the expected "Model" column.
    if current_data.empty or "Model" not in current_data.columns:
        print("β οΈ Warning: Empty or invalid data, using empty leaderboard structure")
        current_data = create_empty_leaderboard_dataframe()

    # Pandas Styler paints score cells on a red-to-green gradient.
    styled_board = create_styled_leaderboard_dataframe(current_data)

    # Column 2 (Model) is rendered as HTML so model names can be clickable links;
    # the remaining columns are numeric scores plus two string columns.
    column_types = ["number", "html"] + ["number"] * 8 + ["str", "number", "str", "number"]
    column_widths = [
        "70px", "250px", "130px", "130px", "160px", "130px", "170px",
        "130px", "100px", "130px", "120px", "120px", "120px", "120px",
    ]

    leaderboard_table = gr.Dataframe(
        value=styled_board,
        interactive=False,
        wrap=True,
        max_height=600,
        show_search=True,
        datatype=column_types,
        column_widths=column_widths,
    )

    # Usage notes rendered beneath the table.
    gr.Markdown("""
    ### π How to Use the Leaderboard:
    - **Search**: Use the search box to find specific models
    - **Color Coding**: Scores are color-coded from red (low) to green (high)
    - **Sorting**: Click on column headers to sort by different metrics
    - **Rankings**: Models are ranked by Mean (Task) score
    ### π Performance Insights:
    - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
    - **Specialized Models**: Some models excel in specific tasks (e.g., retrieval vs classification)
    - **Model Size vs Performance**: Larger models generally perform better but with exceptions
    """)
    return leaderboard_table
def create_dataset_tab():
    """Create the dataset information tab.

    Renders a table of MTEB Turkish tasks (task names link to their Hugging
    Face dataset pages), a task-distribution summary, overall dataset
    statistics, and a description of the evaluation methodology.

    Returns:
        gr.Dataframe: the task-details table component.
    """
    gr.Markdown("### π MTEB Turkish Dataset Overview")
    # Task name -> HF dataset path. Insertion order is the row order of the
    # table below, so keep it aligned with the Task Type / Description /
    # Domain / Samples lists further down.
    task_to_dataset = {
        'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
        'XQuADRetrieval': 'google/xquad',
        'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
        'MKQARetrieval': 'apple/mkqa',
        'MassiveIntentClassification': 'mteb/amazon_massive_intent',
        'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
        'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
        'SIB200Classification': 'mteb/sib200',
        'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
        'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
        'SIB200ClusteringS2S': 'mteb/sib200',
        'XNLI': 'mteb/xnli',
        'XNLIV2': 'mteb/xnli2.0-multi-pair',
        'STS22.v2': 'mteb/sts22-crosslingual-sts'
    }
    # Build clickable task names straight from the mapping — a single source
    # of truth. (Previously a second, hard-coded copy of the task list had to
    # be kept manually in sync with the dict keys.)
    clickable_task_names = [
        f'<a href="https://huggingface.co/datasets/{dataset_path}" target="_blank" '
        f'style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
        for task_name, dataset_path in task_to_dataset.items()
    ]
    # Per-task metadata; each list is positionally aligned with the dict above.
    dataset_data = pd.DataFrame({
        'Task Name': clickable_task_names,
        'Task Type': [
            'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Clustering', 'PairClassification', 'PairClassification', 'STS'
        ],
        'Description': [
            'Turkish FAQ retrieval task',
            'Turkish question answering retrieval',
            'Historical Turkish document retrieval',
            'Multilingual knowledge QA retrieval',
            'Intent classification for Turkish',
            'Scenario classification for Turkish',
            'Multilingual sentiment classification',
            'SIB200 language identification',
            'Turkish movie review sentiment',
            'Turkish product review sentiment',
            'SIB200 clustering task',
            'Turkish natural language inference',
            'Enhanced Turkish NLI task',
            'Turkish semantic textual similarity'
        ],
        'Domain': [
            'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
            'Intent', 'Scenario',
            'Sentiment', 'Language ID',
            'Movies', 'Products',
            'Language ID', 'NLI', 'NLI', 'STS'
        ],
        'Samples': [
            '~135K', '~10K', '~1.4K', '~10K',
            '~11K', '~11K',
            '~4.5K', '~700',
            '~8K', '~4.8K',
            '~1K', '~1.4K', '~1.4K', '~400'
        ]
    })
    dataset_table = gr.Dataframe(
        value=dataset_data,
        label="MTEB Turkish Task Details",
        interactive=False,
        wrap=True,
        datatype=["html", "str", "str", "str", "str"]  # First column (Task Name) as HTML for clickable links
    )
    # Task type distribution summary.
    gr.Markdown("""
    ### π Task Distribution:
    **By Task Type:**
    - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
    - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
    - **Pair Classification**: 2 tasks (natural language inference)
    - **Clustering**: 1 task (language clustering)
    - **STS**: 1 task (semantic textual similarity)
    **By Domain:**
    - **Sentiment Analysis**: Movie and product reviews
    - **Question Answering**: FAQ, reading comprehension, and knowledge QA
    - **Intent/Scenario**: Conversational AI applications
    - **Language Tasks**: NLI, STS, clustering
    - **Multilingual**: Cross-lingual evaluation capabilities
    """)
    # High-level dataset statistics table.
    stats_data = pd.DataFrame({
        'Metric': [
            'Total Tasks',
            'Total Samples',
            'Task Types',
            'Languages',
            'Avg. Tokens per Sample'
        ],
        'Value': [
            '14 tasks',
            '~190K samples',
            '5 types',
            'Turkish + Multilingual',
            '~150 tokens'
        ],
        'Notes': [
            'Comprehensive evaluation across domains',
            'Large-scale evaluation dataset',
            'Classification, Retrieval, STS, NLI, Clustering',
            'Focus on Turkish with multilingual support',
            'Varies by task type and domain'
        ]
    })
    gr.Dataframe(
        value=stats_data,
        label="Dataset Statistics Summary",
        interactive=False
    )
    gr.Markdown("""
    ### π― Evaluation Methodology:
    **Scoring:**
    - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
    - **Mean (Task)**: Direct average of all individual task scores
    - **Mean (TaskType)**: Average of task category means
    - **Individual Categories**: Performance in each task type
    **Model Ranking:**
    - Primary ranking by **Mean (Task)** score
    - Correlation metrics provide additional insights
    - Task-specific performance shows model strengths
    **Quality Assurance:**
    - Standardized evaluation protocols
    - Consistent preprocessing across tasks
    - Multiple metrics per task for robustness
    """)
    return dataset_table
def create_submit_evaluation_tab():
    """Build the submit-evaluation tab: login button, request form, and status area.

    Returns:
        tuple: (model textbox, email textbox, submit button, login button, status HTML).
    """
    gr.Markdown("### π Submit Model for Evaluation")
    gr.Markdown("""
    Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
    **Authentication with Hugging Face is required to submit evaluations.**
    """)

    # Hugging Face OAuth sign-in; submissions require authentication.
    auth_button = gr.LoginButton(value="Sign in with Hugging Face")

    # Submission form fields.
    model_field = gr.Textbox(
        label="π€ Model Name",
        placeholder="sentence-transformers/your-model",
        info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)",
    )
    email_field = gr.Textbox(
        label="π§ Email Address",
        placeholder="your.email@example.com",
        info="Email for notifications about evaluation status and results",
    )
    submit_button = gr.Button("π Submit", variant="primary", size="lg")

    # Authentication / submission status messages are rendered here.
    status_area = gr.HTML(label="Status")

    # Walk the user through the submission workflow.
    gr.Markdown("""
    ### π Evaluation Process:
    1. **Sign In**: First, sign in with your Hugging Face account using the button above
    2. **Submit Request**: Fill out the form with your model details and email
    3. **Admin Review**: Your request will be reviewed by administrators
    4. **Evaluation**: If approved, your model will be evaluated on MTEB Turkish benchmark
    5. **Results**: You'll receive email notifications and results will appear on the leaderboard
    ### β οΈ Important Notes:
    - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
    - You'll receive email updates about your request status
    - Make sure your model is publicly available on HuggingFace
    - Valid email address is required for receiving results
    """)
    return (model_field, email_field, submit_button, auth_button, status_area)
|