Commit · 9a235dc0
Parent(s): Initial commit

Files changed:
- .gitattributes +35 -0
- README.md +22 -0
- api_client.py +103 -0
- app.py +137 -0
- config.py +28 -0
- data_processor.py +208 -0
- evaluation_service.py +190 -0
- leaderboard_data.csv +33 -0
- requirements.txt +7 -0
- ui_components.py +259 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,22 @@
---
title: Mizan
emoji: 📊
colorFrom: red
colorTo: blue
sdk: gradio
sdk_version: 5.49.1
python_version: 3.12
app_file: app.py
pinned: false
short_description: Display benchmark results for embedding models
license: apache-2.0

# OAuth configuration
hf_oauth: true
hf_oauth_client_id: "${OAUTH_CLIENT_ID}"
hf_oauth_client_secret: "${OAUTH_CLIENT_SECRET}"
hf_oauth_expiration_minutes: 30
hf_oauth_scopes:
  - email

---
api_client.py ADDED
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
API Client module for MTEB Turkish Leaderboard
"""

from typing import Optional, Dict, Any
import traceback
import requests

from config import API_BASE_URL, API_TIMEOUT, API_URL, USERNAME, PASSWORD


def check_api_health() -> bool:
    """Check if API is available"""
    try:
        response = requests.get(f"{API_BASE_URL}/api/v1/health", timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False


def send_evaluation_request_to_api(model_name: str, batch_size: int = 32, email: str = "user@example.com") -> Optional[Dict[str, Any]]:
    """
    Send an evaluation request to the API for the specified model.
    Returns the API response as a dictionary if successful, otherwise None.
    """
    try:
        payload = {
            "model_name": model_name,
            "model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
            "batch_size": batch_size,
            "email": email,
            "model_type": "sentence-transformer"
        }

        # Authentication credentials
        auth = (USERNAME, PASSWORD)

        response = requests.post(
            f"{API_URL}/api/mteb/request",
            json=payload,
            timeout=API_TIMEOUT,
            auth=auth
        )

        print(f"Response Status: {response.status_code}")

        if response.status_code == 200:
            result = response.json()
            return result
        else:
            print(f"API Error: {response.status_code}")
            try:
                error_detail = response.json()
                print(f" Error Detail: {error_detail}")
            except ValueError:
                print(f" Raw Response: {response.text}")
            return None

    except Exception as e:
        print(f"API Call Error: {e}")
        traceback.print_exc()
        return None


def get_evaluation_status(request_id: str) -> Optional[Dict[str, Any]]:
"""Get evaluation status from"""
    try:
        auth = (USERNAME, PASSWORD)

        response = requests.get(
            f"{API_URL}/api/mteb/status/{request_id}",
            timeout=API_TIMEOUT,
            auth=auth
        )

        if response.status_code == 200:
            return response.json()
        else:
            print(f"Status check error: {response.status_code}")
            return None

    except Exception as e:
        print(f"Status check error: {e}")
        return None


def cancel_evaluation_request(request_id: str) -> bool:
    """Cancel an evaluation request"""
    try:
        auth = (USERNAME, PASSWORD)

        response = requests.delete(
            f"{API_URL}/api/mteb/request/{request_id}",
            timeout=API_TIMEOUT,
            auth=auth
        )

        return response.status_code == 200

    except Exception as e:
        print(f"Cancel request error: {e}")
        return False
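A minimal sketch of how these client functions compose into a submit-and-poll loop. The "request_id" key and the terminal status values are assumptions about the backend's response shape, not something this commit confirms:

import time

from api_client import send_evaluation_request_to_api, get_evaluation_status
from config import POLL_INTERVAL

result = send_evaluation_request_to_api("org/model-name", batch_size=32, email="you@example.com")
if result is not None and "request_id" in result:  # hypothetical response field
    request_id = result["request_id"]
    status = None
    while status not in ("COMPLETED", "FAILED"):  # hypothetical terminal states
        time.sleep(POLL_INTERVAL)
        data = get_evaluation_status(request_id)
        status = data.get("status") if data else None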
app.py ADDED
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""
Mizan Leaderboard - Enhanced Version with Submit Functionality
Includes leaderboard display, model submission, and evaluation tracking
"""

import gradio as gr

from ui_components import (
    create_leaderboard_tab, create_dataset_tab, create_submit_evaluation_tab
)
from data_processor import load_leaderboard_from_csv
from evaluation_service import submit_evaluation

# Global data storage
current_data = None


def create_leaderboard_demo():
    """Create enhanced leaderboard demo interface with submit functionality"""

    global current_data

    # Setup directories

    # Load data from CSV file
    current_data = load_leaderboard_from_csv()

    with gr.Blocks(
        title="Mizan",
        theme=gr.themes.Soft()
    ) as demo:

        gr.Markdown("""
        # Mizan Leaderboard

        Performance comparison for Turkish embedding models
        """)

        with gr.Tabs():
            # Tab 1: Leaderboard
            with gr.Tab("🏆 Leaderboard"):
                leaderboard_table = create_leaderboard_tab(current_data)

            # Tab 2: Submit
            with gr.Tab("🚀 Submit"):
                (model_input, email_input, submit_btn, login_button, result_output) = create_submit_evaluation_tab()

                # Submit evaluation functionality with authentication
                def handle_submit_evaluation(model_name, email, profile, progress=gr.Progress()):
                    import logging

                    # Authentication check
                    if profile is None:
                        logging.warning("Unauthorized submission attempt with no profile")
                        return "<p style='color: red; font-weight: bold;'>Authentication required. Please log in with your Hugging Face account.</p>"

                    # IMPORTANT: In local development, Gradio returns the "Sign in with Hugging Face" string.
                    # This is NOT a real authentication, just a placeholder for local testing.
                    if isinstance(profile, str) and profile == "Sign in with Hugging Face":
                        # Block submission in local dev with mock auth
                        return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"

                    # Email is required
                    if not email or email.strip() == "":
                        return "<p style='color: red; font-weight: bold;'>Email address is required to receive benchmark results.</p>"

                    global current_data
                    batch_size = 32  # Always use default batch size
                    result_msg, updated_data = submit_evaluation(model_name, email, batch_size, current_data, progress)
                    # Note: For now, we don't update the leaderboard since evaluation is async
                    # The leaderboard will be updated manually when results are available
                    logging.info(f"Submission processed for model: {model_name} by user: {profile}")
                    return result_msg

                submit_btn.click(
                    fn=handle_submit_evaluation,
                    inputs=[model_input, email_input, login_button],
                    outputs=[result_output]
                )

            # Tab 3: Dataset Information
            with gr.Tab("📊 Dataset Information"):
                dataset_table = create_dataset_tab()
                gr.Markdown("""
                ---
                ### 📊 Metrics Explanation:
                - **Mean (Task)**: Average performance across all individual tasks
                - **Mean (TaskType)**: Average performance by task category
                - **Classification**: Performance on Turkish classification tasks
                - **Clustering**: Performance on Turkish clustering tasks
                - **Pair Classification**: Performance on pair classification tasks (like NLI)
                - **Retrieval**: Performance on information retrieval tasks
                - **STS**: Performance on Semantic Textual Similarity tasks
                - **Correlation**: Weighted average of correlation metrics for the NLI and STSB datasets
                - **Parameters**: Number of model parameters
                - **Embed Dim**: Embedding dimension size
                - **Max Seq Length**: Maximum sequence length the model can process (0 = unlimited)
                - **Vocab Size**: Size of the model's vocabulary

                ### 📖 About Mizan:
                This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
                on Turkish language tasks across multiple domains, including:
                - Text classification and sentiment analysis
                - Information retrieval and search
                - Semantic textual similarity
                - Text clustering and pair classification

                ### 🚀 Submit Your Model:
                Use the **Submit** tab to submit your Turkish embedding model for evaluation.
                Your request will be reviewed by administrators, and you'll receive email notifications about its progress.

                ### Contact:
                For any questions or feedback, please contact info@newmind.ai

                ### Links:
                - **GitHub**: [mteb/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
                """)

    return demo


def main():
    """Main entry point"""
    print("🚀 Starting Mizan Leaderboard...")

    demo = create_leaderboard_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )


if __name__ == "__main__":
    main()
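Note that submit_btn.click passes login_button itself as an input, so handle_submit_evaluation receives the button's string value rather than a profile object, which is why it special-cases the "Sign in with Hugging Face" placeholder. For comparison, a sketch of the pattern the Gradio docs describe for Spaces OAuth, as I understand it, where a parameter annotated with gr.OAuthProfile is injected automatically (hedged; this is not what the code above does):

def handle_submit_evaluation(model_name, email, profile: gr.OAuthProfile | None):
    if profile is None:
        return "Authentication required."
    # profile.username identifies the logged-in Hugging Face account
    ...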
config.py ADDED
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
"""
Configuration module for MTEB Turkish Leaderboard
Centralizes environment variables and configuration settings
"""

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# API Configuration (secrets from environment)
API_URL = os.environ.get("API_URL")
USERNAME = os.environ.get("API_USERNAME")
PASSWORD = os.environ.get("API_PASSWORD")

# API Configuration (public settings)
API_BASE_URL = "http://localhost:8000"
API_TIMEOUT = 30

# Polling and refresh intervals (public settings)
POLL_INTERVAL = 5  # seconds
LEADERBOARD_REFRESH_INTERVAL = 30  # seconds

# CSV file path for leaderboard data
CSV_FILE_PATH = "leaderboard_data.csv"
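For local development, load_dotenv() above reads a .env file from the working directory. A sketch of the expected file, with placeholder values (the variable names come from the os.environ.get calls; everything else is illustrative):

# .env -- not committed; keep real credentials out of the repo
API_URL=https://api.example.com
API_USERNAME=changeme
API_PASSWORD=changeme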
data_processor.py ADDED
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
Simplified version for loading and processing CSV data
"""

import os
import pandas as pd
from pandas.io.formats.style import Styler
from matplotlib.colors import LinearSegmentedColormap
import html

# CSV file path
CSV_FILE_PATH = "leaderboard_data.csv"


def load_leaderboard_from_csv() -> pd.DataFrame:
    """Load leaderboard data from CSV file"""
    try:
        if not os.path.exists(CSV_FILE_PATH):
            print(f"❌ CSV file not found: {CSV_FILE_PATH}")
            return create_empty_leaderboard_dataframe()

        df = pd.read_csv(CSV_FILE_PATH)
        print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}")

        # Convert to leaderboard format
        leaderboard_df = csv_to_leaderboard_format(df)

        # Sort by Mean (Task) score and add rankings
        leaderboard_df = leaderboard_df.sort_values("Mean (Task)", ascending=False).reset_index(drop=True)
        leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1)

        return leaderboard_df

    except Exception as e:
        print(f"❌ Error loading CSV: {e}")
        return create_empty_leaderboard_dataframe()


def create_empty_leaderboard_dataframe() -> pd.DataFrame:
    """Create an empty DataFrame with the proper leaderboard column structure"""
    return pd.DataFrame(columns=[
        "Rank",
        "Model",
        "Mean (Task)",
        "Mean (TaskType)",
        "Classification",
        "Clustering",
        "Pair Classification",
        "Retrieval",
        "STS",
        "Correlation",
        "Parameters",
        "Embed Dim",
        "Max Sequence Length",
        "Vocab Size",
    ])


def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
    """Convert CSV data to leaderboard format"""
    data = []
    for idx, row in df.iterrows():
        model_name = row['Model']

        # Prepare model name for display
        model_name_clean = html.escape(model_name)

        # Create clickable HuggingFace link for model name
        hf_link = f"https://huggingface.co/{model_name_clean}"
        clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'

        # Handle different column name variations
        embedding_dim_col = 'Embedding Dim'
        max_seq_col = 'Max Seq Length'
        pair_classification_col = 'Pair Classification'

        data_row = {
            "Rank": idx + 1,  # Initial ranking, will be recalculated
            "Model": clickable_model,
            "Mean (Task)": round(float(row['Mean (Task)']), 2),
            "Mean (TaskType)": round(float(row['Mean (TaskType)']), 2),
            "Classification": round(float(row['Classification']), 2),
            "Clustering": round(float(row['Clustering']), 2),
            "Pair Classification": round(float(row[pair_classification_col]), 2),
            "Retrieval": round(float(row['Retrieval']), 2),
            "STS": round(float(row['STS']), 2),
            "Correlation": round(float(row['Correlation']), 3) if not pd.isna(row['Correlation']) else "N/A",
            "Parameters": row['Number of Parameters'],
            "Embed Dim": int(float(row[embedding_dim_col])) if not pd.isna(row[embedding_dim_col]) else 0,
            "Max Sequence Length": "N/A" if pd.isna(row[max_seq_col]) or row[max_seq_col] == "Unknown" else int(float(row[max_seq_col])),
            "Vocab Size": int(float(row['Vocab Size'])) if 'Vocab Size' in row and not pd.isna(row['Vocab Size']) else 0
        }
        data.append(data_row)

    result_df = pd.DataFrame(data)
    return result_df


def create_excel_like_cmap():
    """Create an Excel-like colormap for score visualization"""
    colors = [
        (0.9, 0.1, 0.2),          # Red
        (1.0, 1.0, 0.0),          # Yellow
        (0/255, 176/255, 80/255)  # Excel-style Green
    ]

    return LinearSegmentedColormap.from_list("excel_like", colors, N=256)


def rgb_to_hex(rgb_tuple):
    """Convert RGB tuple to hex color"""
    r, g, b = [int(x * 255) for x in rgb_tuple[:3]]
    return f"#{r:02x}{g:02x}{b:02x}"


def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
    """Create colored cell HTML for score visualization"""
    if pd.isna(value) or value == "N/A":
        return str(value)

    try:
        # Normalize value to 0-1 range
        if max_val > min_val:
            normalized = (float(value) - min_val) / (max_val - min_val)
        else:
            normalized = 0.5

        # Get color from colormap
        color_rgba = colormap(normalized)
        color_hex = rgb_to_hex(color_rgba)

        # Create colored cell HTML with data-sort attribute for proper numeric sorting
        return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'

    except (ValueError, TypeError):
        return str(value)


def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
    """Create a styled leaderboard dataframe with color coding and clickable model names using pandas Styler

    Returns a pandas Styler object that Gradio Dataframe can render with both colors AND correct sorting.
    """
    if df.empty:
        return df.style

    colormap = create_excel_like_cmap()

    # Score columns to colorize
    score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
                     "Pair Classification", "Retrieval", "STS", "Correlation"]

    # Calculate min/max for each score column for normalization
    color_ranges = {}
    for col in score_columns:
        if col in df.columns:
            numeric_values = pd.to_numeric(df[col], errors='coerce')
            if not numeric_values.isna().all():
                color_ranges[col] = {
                    'min': numeric_values.min(),
                    'max': numeric_values.max()
                }

    # Create styler with background colors for score columns
    def apply_color_gradient(val, col_name):
        """Apply background color based on value"""
        if col_name not in color_ranges:
            return ''

        if pd.isna(val) or val == "N/A":
            return ''

        try:
            min_val = color_ranges[col_name]['min']
            max_val = color_ranges[col_name]['max']

            # Normalize value to 0-1 range
            if max_val > min_val:
                normalized = (float(val) - min_val) / (max_val - min_val)
            else:
                normalized = 0.5

            # Get color from colormap
            color_rgba = colormap(normalized)
            color_hex = rgb_to_hex(color_rgba)

            return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
        except (ValueError, TypeError):
            return ''

    # Apply styling to score columns using map (applymap is deprecated)
    styler = df.style
    for col in score_columns:
        if col in df.columns:
            styler = styler.map(lambda val, c=col: apply_color_gradient(val, c), subset=[col])

    # Format score columns to 2 decimal places
    format_dict = {}
    for col in score_columns:
        if col in df.columns:
            format_dict[col] = '{:.2f}'

    if format_dict:
        styler = styler.format(format_dict, na_rep='N/A')

    return styler
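A short sketch of how these pieces are meant to compose, mirroring what app.py and ui_components.py do (assumes leaderboard_data.csv sits next to the script):

from data_processor import load_leaderboard_from_csv, create_styled_leaderboard_dataframe

df = load_leaderboard_from_csv()                  # sorted by Mean (Task), Rank recomputed
styler = create_styled_leaderboard_dataframe(df)  # pandas Styler with color gradients
# The Styler can be passed directly as the value of a gr.Dataframe component.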
evaluation_service.py ADDED
@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
Evaluation Service module for MTEB Turkish Leaderboard
Handles evaluation submissions and status tracking
"""

import time
import re
from typing import Optional, Tuple, List
import traceback
import pandas as pd
import gradio as gr

from api_client import send_evaluation_request_to_api, get_evaluation_status, cancel_evaluation_request

# Global state management for active evaluations
active_evaluations = {}  # request_id -> {"status": str, "model_name": str, "email": str, "start_time": float}


def get_active_evaluations_status() -> str:
    """Show status of active evaluations"""
    if not active_evaluations:
        return "🟢 No active evaluation requests"

    status_lines = []
    for request_id, info in active_evaluations.items():
        model_name = info["model_name"]
        email = info["email"]
        elapsed = int(time.time() - info["start_time"])
        status = info.get("status", "PENDING")
        status_lines.append(f"📊 {model_name} ({email}) - {request_id} [{status}] ({elapsed}s)")

    return "\n".join(status_lines)


def get_active_evaluations_with_cancel_options() -> Tuple[str, List[str]]:
    """Get active evaluations status and cancellation options"""
    status_text = get_active_evaluations_status()

    cancel_options = []
    for request_id, info in active_evaluations.items():
        model_name = info["model_name"]
        cancel_options.append(f"{request_id} - {model_name}")

    return status_text, cancel_options


def clear_active_evaluations() -> str:
    """Clear all active evaluations from tracking"""
    global active_evaluations
    count = len(active_evaluations)
    active_evaluations.clear()
    return f"✅ Cleared {count} active evaluation(s) from tracking"


def cancel_active_evaluation(selection: str) -> str:
    """Cancel a selected active evaluation"""
    if not selection:
        return "❌ No evaluation selected for cancellation"

    try:
        request_id = selection.split(" - ")[0]

        if request_id not in active_evaluations:
            return f"❌ Evaluation {request_id} not found in active evaluations"

        # Try to cancel via API
        success = cancel_evaluation_request(request_id)

        if success:
            model_name = active_evaluations[request_id]["model_name"]
            del active_evaluations[request_id]
            return f"✅ Successfully cancelled evaluation for {model_name} (ID: {request_id})"
        else:
            return f"❌ Failed to cancel evaluation {request_id}. Check API connection."

    except Exception as e:
        return f"❌ Error cancelling evaluation: {str(e)}"


def _validate_evaluation_request(model_name: str, email: str = None) -> Optional[str]:
    """Validate evaluation request parameters"""
    # Model name validation
    if not model_name or not model_name.strip():
        return "❌ Model name cannot be empty!"

    model_name = model_name.strip()

    # Check model name length (format: org/model-name)
    if len(model_name) < 3:
        return "❌ Model name too short!"

    if len(model_name) > 256:
        return "❌ Model name too long (maximum 256 characters)!"

    # Check for valid HuggingFace model name format (must be org/model)
    if '/' not in model_name:
        return "❌ Invalid model name format! Must include organization (e.g., organization/model-name)"

    if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
        return "❌ Invalid model name format! Use format: organization/model-name"

    # Email validation
    if not email or not email.strip():
        return "❌ Email address cannot be empty!"

    email = email.strip()

    if len(email) > 254:
        return "❌ Email address too long!"

    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    if not re.match(email_pattern, email):
        return "❌ Invalid email address format!"

    return None


def submit_evaluation(model_name: str, email: str, batch_size: int, current_data: pd.DataFrame, progress=gr.Progress()) -> Tuple[str, Optional[pd.DataFrame]]:
    """Validate the request, forward it to the API, and return a status message plus the (unchanged) leaderboard data"""
    try:
        # Input validation
        error_msg = _validate_evaluation_request(model_name, email)
        if error_msg:
            return error_msg, None

        # Show progress
        progress(0.1, desc="Sending evaluation request to API...")

        # Send request to API - regardless of backend response, show success to user
        api_response = send_evaluation_request_to_api(model_name, batch_size, email)

        # Always show success message to user
        # Backend errors (like duplicate requests) are handled by the API and communicated via email
        progress(1.0, desc="Request submitted successfully!")

        # Return success message regardless of backend response
        success_msg = f"""
✅ Evaluation request submitted successfully!

🤖 Model: {model_name}
📧 Email: {email}

📋 Next Steps:
⏱️ Your request will be reviewed by our system
📧 You will receive email notifications about the status of your evaluation
🔁 If you've submitted this model before, you'll be notified via email

Thank you for contributing to the Mizan Leaderboard!
"""

        return success_msg.strip(), current_data

    except Exception as e:
        # Log error for debugging
        print(f"❌ Error submitting evaluation: {str(e)}")
        traceback.print_exc()

        error_msg = f"""
❌ Failed to submit evaluation request

🤖 Model: {model_name}
📧 Email: {email}

⚠️ Error: Unable to connect to the evaluation service.

Please try again later or contact support if the problem persists.
"""
        return error_msg.strip(), None


def refresh_evaluation_status() -> str:
    """Refresh status of all active evaluations"""
    if not active_evaluations:
        return "🟢 No active evaluations to refresh"

    updated_count = 0
    for request_id, info in active_evaluations.items():
        try:
            status_data = get_evaluation_status(request_id)
            if status_data and "status" in status_data:
                old_status = info.get("status", "UNKNOWN")
                new_status = status_data["status"]
                if old_status != new_status:
                    info["status"] = new_status
                    updated_count += 1
                    print(f"Status updated for {request_id}: {old_status} -> {new_status}")
        except Exception as e:
            print(f"Error refreshing status for {request_id}: {e}")

    return f"🔄 Refreshed status for {len(active_evaluations)} evaluation(s). {updated_count} status change(s) detected."
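_validate_evaluation_request returns None when the input passes and an error string otherwise, so callers treat any truthy result as a failure. A quick illustration (expected results shown as comments):

from evaluation_service import _validate_evaluation_request

print(_validate_evaluation_request("BAAI/bge-m3", "user@example.com"))  # None -- valid
print(_validate_evaluation_request("bge-m3", "user@example.com"))       # error: missing organization
print(_validate_evaluation_request("BAAI/bge-m3", "not-an-email"))      # error: invalid email format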
leaderboard_data.csv ADDED
@@ -0,0 +1,33 @@
Model,Number of Parameters,Embedding Dim,Max Seq Length,Mean (Task),Mean (TaskType),Classification,Clustering,Pair Classification,Retrieval,STS,Correlation,Vocab Size
BAAI/bge-m3,567M,1024,8192,69.39,63.51,75.68,35.26,78.88,57.89,69.83,0.61,250002
intfloat/multilingual-e5-large,559M,1024,512,66.61,62.08,71.8,41.2,72.76,57.17,67.49,0.58,250002
newmindai/TurkEmbed4STS,305M,768,8192,65.66,62.03,69.69,44.29,81.77,47.6,66.79,0.68,250048
ytu-ce-cosmos/turkish-e5-large,559M,1024,512,64.93,59.73,72.42,38.51,70.86,47.6,69.24,0.56,250002
intfloat/multilingual-e5-large-instruct,559M,1024,512,64.33,58.57,72.25,33.16,72.92,44.95,69.56,0.57,250002
nomic-ai/nomic-embed-text-v2-moe,475M,768,512,64.28,60.15,70.07,41.28,63.87,56.4,69.16,0.53,250048
Alibaba-NLP/gte-multilingual-base,305M,768,32768,63.86,60.04,68.0,39.16,76.0,50.12,66.94,0.62,250048
sentence-transformers/paraphrase-multilingual-mpnet-base-v2,278M,768,512,63.33,57.63,70.88,41.35,83.6,33.81,58.51,0.65,250002
newmindai/modernbert-base-tr-uncased-allnli-stsb,134M,768,8192,61.29,54.09,71.47,35.46,82.83,24.81,55.89,0.66,32000
numind/NuSentiment-multilingual,278M,768,512,60.52,49.65,73.67,14.96,76.89,32.76,49.96,0.52,250002
newmindai/TurkEmbed4Retrieval,305M,768,512,60.5,58.04,64.78,47.47,64.04,47.82,66.1,0.57,250048
Qwen/Qwen3-Embedding-0.6B,595M,1024,131072,60.18,56.53,64.68,33.36,66.02,50.06,68.55,0.48,151669
sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,117M,384,512,59.95,54.8,67.21,42.31,79.3,29.95,55.24,0.6,250037
newmindai/TurkEmbed4STS-HD,305M,768,8192,59.94,53.06,67.61,34.24,80.08,35.88,47.47,0.65,250048
emrecan/bert-base-turkish-cased-mean-nli-stsb-tr,110M,768,512,59.92,52.65,68.38,24.61,74.94,39.0,56.3,0.62,32000
ibm-granite/granite-embedding-278m-multilingual,278M,768,512,55.9,54.48,58.64,41.98,60.13,45.08,66.57,0.41,250002
newmindai/ModernBERT-tr-uncased-stsb-HD,134M,768,8192,54.51,43.94,67.17,17.96,82.51,16.08,35.98,0.61,32000
ibm-granite/granite-embedding-107m-multilingual,106M,384,512,52.68,50.72,55.75,34.17,59.86,39.97,63.85,0.38,250002
minishlab/potion-multilingual-128M,128M,256,N/A,50.39,44.47,58.34,23.47,59.76,30.84,49.93,0.43,500358
google/embeddinggemma-300m,307M,768,2048,49.08,44.98,55.23,22.84,61.02,26.92,58.91,0.27,262144
nomic-ai/nomic-embed-text-v1,136M,768,8192,45.12,41.46,48.3,9.45,59.75,32.9,56.88,0.42,30528
nomic-ai/nomic-embed-text-v1.5,136M,768,8192,44.63,40.04,48.92,9.69,58.53,32.19,50.89,0.41,30528
mixedbread-ai/mxbai-embed-large-v1,335M,1024,512,44.0,39.23,49.49,15.99,56.66,27.75,46.25,0.37,30522
sentence-transformers/multi-qa-MiniLM-L6-cos-v1,22M,384,512,38.82,32.39,44.08,5.55,58.29,25.16,28.88,0.34,30522
boun-tabi-LMG/TURNA,495M,1024,1024,38.36,30.96,47.17,10.26,56.62,13.04,27.73,0.22,32128
sentence-transformers/all-MiniLM-L12-v2,33M,384,512,38.28,31.13,44.77,7.82,58.2,21.64,23.24,0.36,30522
nielsr/lilt-xlm-roberta-base,284M,768,512,38.01,29.57,50.1,12.79,55.35,2.45,27.14,0.22,250002
sentence-transformers/all-MiniLM-L6-v2,22M,384,512,37.95,31.97,44.46,6.58,56.75,16.48,35.55,0.31,30522
sentence-transformers/all-mpnet-base-v2,109M,768,512,37.21,31.31,43.75,10.56,55.99,15.16,31.08,0.31,30527
minishlab/potion-base-8M,7M,256,N/A,36.85,30.01,42.51,2.26,57.86,21.75,25.64,0.36,29528
sentence-transformers/paraphrase-MiniLM-L6-v2,22M,384,512,36.26,28.19,44.02,4.53,56.62,17.47,18.29,0.33,30522
newmindai/lettucedect-210m-eurobert-tr-v1,211M,768,8192,27.66,21.55,34.32,1.54,52.34,0.22,19.34,0.1,128256
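One subtlety in this data: pandas parses the N/A entries in Max Seq Length (the two potion models) as NaN, which csv_to_leaderboard_format in data_processor.py then renders back as the string "N/A". A quick check, assuming the file is in the working directory:

import pandas as pd

df = pd.read_csv("leaderboard_data.csv")
print(df["Max Seq Length"].isna().sum())  # 2 -- the rows where the CSV says N/A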
requirements.txt ADDED
@@ -0,0 +1,7 @@
gradio>=5.49.1
pandas>=2.3.3
numpy>=2.3.4
matplotlib>=3.10.7
requests>=2.32.5
python-dotenv>=1.1.1
itsdangerous>=2.2.0
ui_components.py ADDED
@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""
UI Components module for MTEB Turkish Leaderboard - HF Spaces Version
Simplified version with only leaderboard and dataset components
"""

import gradio as gr
import pandas as pd
from data_processor import (create_styled_leaderboard_dataframe,
                            create_empty_leaderboard_dataframe)


def create_leaderboard_tab(current_data: pd.DataFrame):
    """Create the main leaderboard tab with color styling"""

    # Handle empty or invalid data
    if current_data.empty or "Model" not in current_data.columns:
        print("⚠️ Warning: Empty or invalid data, using empty leaderboard structure")
        current_data = create_empty_leaderboard_dataframe()

    # Apply color styling to score columns using pandas Styler
    styled_data = create_styled_leaderboard_dataframe(current_data)

    leaderboard = gr.Dataframe(
        value=styled_data,
        interactive=False,
        wrap=True,
        max_height=600,
        show_search=True,
        datatype=["number", "html", "number", "number", "number", "number", "number", "number", "number", "number", "str", "number", "str", "number"],  # Model column as HTML for clickable links
        column_widths=["70px", "250px", "130px", "130px", "160px", "130px", "170px", "130px", "100px", "130px", "120px", "120px", "120px", "120px"]
    )

    # Information about the leaderboard
    gr.Markdown("""
    ### 📋 How to Use the Leaderboard:
    - **Search**: Use the search box to find specific models
    - **Color Coding**: Scores are color-coded from red (low) to green (high)
    - **Sorting**: Click on column headers to sort by different metrics
    - **Rankings**: Models are ranked by Mean (Task) score

    ### 📈 Performance Insights:
    - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
    - **Specialized Models**: Some models excel in specific tasks (e.g., retrieval vs classification)
    - **Model Size vs Performance**: Larger models generally perform better, but with exceptions
    """)

    return leaderboard


def create_dataset_tab():
    """Create the dataset information tab"""

    gr.Markdown("### 📊 MTEB Turkish Dataset Overview")

    # Task name to dataset path mapping
    task_to_dataset = {
        'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
        'XQuADRetrieval': 'google/xquad',
        'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
        'MKQARetrieval': 'apple/mkqa',
        'MassiveIntentClassification': 'mteb/amazon_massive_intent',
        'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
        'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
        'SIB200Classification': 'mteb/sib200',
        'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
        'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
        'SIB200ClusteringS2S': 'mteb/sib200',
        'XNLI': 'mteb/xnli',
        'XNLIV2': 'mteb/xnli2.0-multi-pair',
        'STS22.v2': 'mteb/sts22-crosslingual-sts'
    }

    # Create clickable task names
    clickable_task_names = []
    for task_name in [
        'WebFAQRetrieval', 'XQuADRetrieval', 'TurHistQuadRetrieval', 'MKQARetrieval',
        'MassiveIntentClassification', 'MassiveScenarioClassification',
        'MultilingualSentimentClassification', 'SIB200Classification',
        'TurkishMovieSentimentClassification', 'TurkishProductSentimentClassification',
        'SIB200ClusteringS2S', 'XNLI', 'XNLIV2', 'STS22.v2'
    ]:
        dataset_path = task_to_dataset[task_name]
        hf_link = f"https://huggingface.co/datasets/{dataset_path}"
        clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
        clickable_task_names.append(clickable_name)

    # Create dataset information table
    dataset_data = pd.DataFrame({
        'Task Name': clickable_task_names,
        'Task Type': [
            'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Clustering', 'PairClassification', 'PairClassification', 'STS'
        ],
        'Description': [
            'Turkish FAQ retrieval task',
            'Turkish question answering retrieval',
            'Historical Turkish document retrieval',
            'Multilingual knowledge QA retrieval',
            'Intent classification for Turkish',
            'Scenario classification for Turkish',
            'Multilingual sentiment classification',
            'SIB200 language identification',
            'Turkish movie review sentiment',
            'Turkish product review sentiment',
            'SIB200 clustering task',
            'Turkish natural language inference',
            'Enhanced Turkish NLI task',
            'Turkish semantic textual similarity'
        ],
        'Domain': [
            'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
            'Intent', 'Scenario',
            'Sentiment', 'Language ID',
            'Movies', 'Products',
            'Language ID', 'NLI', 'NLI', 'STS'
        ],
        'Samples': [
            '~135K', '~10K', '~1.4K', '~10K',
            '~11K', '~11K',
            '~4.5K', '~700',
            '~8K', '~4.8K',
            '~1K', '~1.4K', '~1.4K', '~400'
        ]
    })

    dataset_table = gr.Dataframe(
        value=dataset_data,
        label="MTEB Turkish Task Details",
        interactive=False,
        wrap=True,
        datatype=["html", "str", "str", "str", "str"]  # First column (Task Name) as HTML for clickable links
    )

    # Task type distribution
    gr.Markdown("""
    ### 📊 Task Distribution:

    **By Task Type:**
    - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
    - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
    - **Pair Classification**: 2 tasks (natural language inference)
    - **Clustering**: 1 task (language clustering)
    - **STS**: 1 task (semantic textual similarity)

    **By Domain:**
    - **Sentiment Analysis**: Movie and product reviews
    - **Question Answering**: FAQ, reading comprehension, and knowledge QA
    - **Intent/Scenario**: Conversational AI applications
    - **Language Tasks**: NLI, STS, clustering
    - **Multilingual**: Cross-lingual evaluation capabilities
    """)

    # Statistics summary
    stats_data = pd.DataFrame({
        'Metric': [
            'Total Tasks',
            'Total Samples',
            'Task Types',
            'Languages',
            'Avg. Tokens per Sample'
        ],
        'Value': [
            '14 tasks',
            '~190K samples',
            '5 types',
            'Turkish + Multilingual',
            '~150 tokens'
        ],
        'Notes': [
            'Comprehensive evaluation across domains',
            'Large-scale evaluation dataset',
            'Classification, Retrieval, STS, NLI, Clustering',
            'Focus on Turkish with multilingual support',
            'Varies by task type and domain'
        ]
    })

    gr.Dataframe(
        value=stats_data,
        label="Dataset Statistics Summary",
        interactive=False
    )

    gr.Markdown("""
    ### 🎯 Evaluation Methodology:

    **Scoring:**
    - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
    - **Mean (Task)**: Direct average of all individual task scores
    - **Mean (TaskType)**: Average of task category means
    - **Individual Categories**: Performance in each task type

    **Model Ranking:**
    - Primary ranking by **Mean (Task)** score
    - Correlation metrics provide additional insights
    - Task-specific performance shows model strengths

    **Quality Assurance:**
    - Standardized evaluation protocols
    - Consistent preprocessing across tasks
    - Multiple metrics per task for robustness
    """)

    return dataset_table


def create_submit_evaluation_tab():
    """Create the submit evaluation tab with form"""

    gr.Markdown("### 🚀 Submit Model for Evaluation")
    gr.Markdown("""
    Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
    **Authentication with Hugging Face is required to submit evaluations.**
    """)

    # OAuth login button
    login_button = gr.LoginButton(value="Sign in with Hugging Face")

    model_input = gr.Textbox(
        label="🤖 Model Name",
        placeholder="sentence-transformers/your-model",
        info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
    )

    email_input = gr.Textbox(
        label="📧 Email Address",
        placeholder="your.email@example.com",
        info="Email for notifications about evaluation status and results"
    )

    submit_btn = gr.Button(
        "🚀 Submit",
        variant="primary",
        size="lg"
    )

    # Result output for authentication messages
    result_output = gr.HTML(label="Status")

    # Information about the evaluation process
    gr.Markdown("""
    ### 📋 Evaluation Process:
    1. **Sign In**: First, sign in with your Hugging Face account using the button above
    2. **Submit Request**: Fill out the form with your model details and email
    3. **Admin Review**: Your request will be reviewed by administrators
    4. **Evaluation**: If approved, your model will be evaluated on the MTEB Turkish benchmark
    5. **Results**: You'll receive email notifications, and results will appear on the leaderboard

    ### ⚠️ Important Notes:
    - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
    - You'll receive email updates about your request status
    - Make sure your model is publicly available on HuggingFace
    - A valid email address is required for receiving results
    """)

    return (model_input, email_input, submit_btn, login_button, result_output)