nmmursit committed
Commit 9a235dc · 0 Parent(s)

Initial commit

Files changed (10)
  1. .gitattributes +35 -0
  2. README.md +22 -0
  3. api_client.py +103 -0
  4. app.py +137 -0
  5. config.py +28 -0
  6. data_processor.py +208 -0
  7. evaluation_service.py +190 -0
  8. leaderboard_data.csv +33 -0
  9. requirements.txt +7 -0
  10. ui_components.py +259 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,22 @@
+ ---
+ title: Mizan
+ emoji: 📊
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.49.1
+ python_version: 3.12
+ app_file: app.py
+ pinned: false
+ short_description: Display benchmark results for embedding models
+ license: apache-2.0
+
+ # OAuth configuration
+ hf_oauth: true
+ hf_oauth_client_id: "${OAUTH_CLIENT_ID}"
+ hf_oauth_client_secret: "${OAUTH_CLIENT_SECRET}"
+ hf_oauth_expiration_minutes: 30
+ hf_oauth_scopes:
+   - email
+
+ ---
api_client.py ADDED
@@ -0,0 +1,103 @@
+ #!/usr/bin/env python3
+ """
+ API Client module for MTEB Turkish Leaderboard
+ """
+
+ from typing import Optional, Dict, Any
+ import traceback
+ import requests
+
+ from config import API_BASE_URL, API_TIMEOUT, API_URL, USERNAME, PASSWORD
+
+
+ def check_api_health() -> bool:
+     """Check if the API is available"""
+     try:
+         response = requests.get(f"{API_BASE_URL}/api/v1/health", timeout=5)
+         return response.status_code == 200
+     except requests.RequestException:
+         return False
+
+
+ def send_evaluation_request_to_api(model_name: str, batch_size: int = 32, email: str = "user@example.com") -> Optional[Dict[str, Any]]:
+     """
+     Send an evaluation request to the API for the specified model.
+     Returns the API response as a dictionary if successful, otherwise None.
+     """
+     try:
+         payload = {
+             "model_name": model_name,
+             "model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
+             "batch_size": batch_size,
+             "email": email,
+             "model_type": "sentence-transformer"
+         }
+
+         # Authentication credentials
+         auth = (USERNAME, PASSWORD)
+
+         response = requests.post(
+             f"{API_URL}/api/mteb/request",
+             json=payload,
+             timeout=API_TIMEOUT,
+             auth=auth
+         )
+
+         print(f"Response Status: {response.status_code}")
+
+         if response.status_code == 200:
+             return response.json()
+         else:
+             print(f"API Error: {response.status_code}")
+             try:
+                 error_detail = response.json()
+                 print(f"  Error Detail: {error_detail}")
+             except ValueError:
+                 print(f"  Raw Response: {response.text}")
+             return None
+
+     except Exception as e:
+         print(f"API Call Error: {e}")
+         traceback.print_exc()
+         return None
+
+
+ def get_evaluation_status(request_id: str) -> Optional[Dict[str, Any]]:
+     """Get evaluation status from the API"""
+     try:
+         auth = (USERNAME, PASSWORD)
+
+         response = requests.get(
+             f"{API_URL}/api/mteb/status/{request_id}",
+             timeout=API_TIMEOUT,
+             auth=auth
+         )
+
+         if response.status_code == 200:
+             return response.json()
+         else:
+             print(f"Status check error: {response.status_code}")
+             return None
+
+     except Exception as e:
+         print(f"Status check error: {e}")
+         return None
+
+
+ def cancel_evaluation_request(request_id: str) -> bool:
+     """Cancel an evaluation request"""
+     try:
+         auth = (USERNAME, PASSWORD)
+
+         response = requests.delete(
+             f"{API_URL}/api/mteb/request/{request_id}",
+             timeout=API_TIMEOUT,
+             auth=auth
+         )
+
+         return response.status_code == 200
+
+     except Exception as e:
+         print(f"Cancel request error: {e}")
+         return False
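
For illustration, a minimal sketch of how this client is meant to be called. It assumes the backend from config.py is reachable and that `API_USERNAME`/`API_PASSWORD` are set; the `request_id` response field is an assumption, since the actual schema is defined by the backend:

```python
# Hypothetical usage of api_client.py; not part of the committed code.
from api_client import check_api_health, send_evaluation_request_to_api, get_evaluation_status

if check_api_health():
    result = send_evaluation_request_to_api("org/my-model", batch_size=32, email="me@example.com")
    # "request_id" is an assumed response field; adjust to the real schema.
    if result and "request_id" in result:
        print(get_evaluation_status(result["request_id"]))
else:
    print("API is not reachable")
```

Note that `check_api_health` targets `API_BASE_URL` (hard-coded to localhost) while the request functions target the `API_URL` environment setting, so the two can point at different hosts.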
app.py ADDED
@@ -0,0 +1,137 @@
+ #!/usr/bin/env python3
+ """
+ Mizan Leaderboard - Enhanced Version with Submit Functionality
+ Includes leaderboard display, model submission, and evaluation tracking
+ """
+
+ import gradio as gr
+
+ from ui_components import (
+     create_leaderboard_tab, create_dataset_tab, create_submit_evaluation_tab
+ )
+ from data_processor import load_leaderboard_from_csv
+ from evaluation_service import submit_evaluation
+
+ # Global data storage
+ current_data = None
+
+
+ def create_leaderboard_demo():
+     """Create enhanced leaderboard demo interface with submit functionality"""
+
+     global current_data
+
+     # Load data from the CSV file
+     current_data = load_leaderboard_from_csv()
+
+     with gr.Blocks(
+         title="Mizan",
+         theme=gr.themes.Soft()
+     ) as demo:
+
+         gr.Markdown("""
+         # Mizan Leaderboard
+
+         Performance comparison for Turkish embedding models
+         """)
+
+         with gr.Tabs():
+             # Tab 1: Leaderboard
+             with gr.Tab("📊 Leaderboard"):
+                 leaderboard_table = create_leaderboard_tab(current_data)
+
+             # Tab 2: Submit
+             with gr.Tab("🚀 Submit"):
+                 (model_input, email_input, submit_btn, login_button, result_output) = create_submit_evaluation_tab()
+
+                 # Submit evaluation functionality with authentication
+                 def handle_submit_evaluation(model_name, email, profile, progress=gr.Progress()):
+                     import logging
+
+                     # Authentication check
+                     if profile is None:
+                         logging.warning("Unauthorized submission attempt with no profile")
+                         return "<p style='color: red; font-weight: bold;'>Authentication required. Please log in with your Hugging Face account.</p>"
+
+                     # IMPORTANT: In local development, Gradio returns the "Sign in with Hugging Face"
+                     # string. This is NOT real authentication, just a placeholder for local testing.
+                     if isinstance(profile, str) and profile == "Sign in with Hugging Face":
+                         # Block submission in local dev with mock auth
+                         return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"
+
+                     # Email is required
+                     if not email or email.strip() == "":
+                         return "<p style='color: red; font-weight: bold;'>Email address is required to receive benchmark results.</p>"
+
+                     global current_data
+                     batch_size = 32  # Always use the default batch size
+                     result_msg, updated_data = submit_evaluation(model_name, email, batch_size, current_data, progress)
+                     # Note: the leaderboard is not updated here because evaluation is async;
+                     # it is updated manually when results become available.
+                     logging.info(f"Submission processed for model: {model_name} by user: {profile}")
+                     return result_msg
+
+                 submit_btn.click(
+                     fn=handle_submit_evaluation,
+                     inputs=[model_input, email_input, login_button],
+                     outputs=[result_output]
+                 )
+
+             # Tab 3: Dataset Information
+             with gr.Tab("📊 Dataset Information"):
+                 dataset_table = create_dataset_tab()
+                 gr.Markdown("""
+                 ---
+                 ### 📊 Metrics Explanation:
+                 - **Mean (Task)**: Average performance across all individual tasks
+                 - **Mean (TaskType)**: Average performance by task category
+                 - **Classification**: Performance on Turkish classification tasks
+                 - **Clustering**: Performance on Turkish clustering tasks
+                 - **Pair Classification**: Performance on pair classification tasks (such as NLI)
+                 - **Retrieval**: Performance on information retrieval tasks
+                 - **STS**: Performance on Semantic Textual Similarity tasks
+                 - **Correlation**: Weighted average of correlation metrics for the NLI and STSB datasets
+                 - **Parameters**: Number of model parameters
+                 - **Embed Dim**: Embedding dimension size
+                 - **Max Seq Length**: Maximum sequence length the model can process (0 = unlimited)
+                 - **Vocab Size**: Size of the model's vocabulary
+
+                 ### 📖 About Mizan:
+                 This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
+                 on Turkish language tasks across multiple domains, including:
+                 - Text classification and sentiment analysis
+                 - Information retrieval and search
+                 - Semantic textual similarity
+                 - Text clustering and pair classification
+
+                 ### 🚀 Submit Your Model:
+                 Use the **Submit** tab to submit your Turkish embedding model for evaluation.
+                 Your request will be reviewed by administrators, and you will receive email notifications about its progress.
+
+                 ### Contact:
+                 For any questions or feedback, please contact info@newmind.ai
+
+                 ### Links:
+                 - **GitHub**: [mteb/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
+                 """)
+
+     return demo
+
+
+ def main():
+     """Main entry point"""
+     print("🚀 Starting Mizan Leaderboard...")
+
+     demo = create_leaderboard_demo()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False
+     )
+
+
+ if __name__ == "__main__":
+     main()
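
The submit handler above receives the LoginButton's value as its `profile` argument. For comparison, Gradio's documented OAuth pattern injects the profile through a typed parameter instead of wiring the button as an input; a minimal sketch, assuming the app runs on Spaces with `hf_oauth: true` as in the README:

```python
# Sketch of Gradio's annotation-based OAuth injection; not the committed code.
import gradio as gr

def whoami(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects the OAuth profile based on the type annotation;
    # it is None when the visitor is not logged in.
    if profile is None:
        return "Please sign in with Hugging Face."
    return f"Signed in as {profile.username}"

with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    demo.load(whoami, inputs=None, outputs=status)
```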
config.py ADDED
@@ -0,0 +1,28 @@
+ #!/usr/bin/env python3
+ """
+ Configuration module for MTEB Turkish Leaderboard
+ Centralizes environment variables and configuration settings
+ """
+
+ import os
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # API Configuration (secrets from the environment)
+ API_URL = os.environ.get("API_URL")
+ USERNAME = os.environ.get("API_USERNAME")
+ PASSWORD = os.environ.get("API_PASSWORD")
+
+ # API Configuration (public settings)
+ API_BASE_URL = "http://localhost:8000"
+ API_TIMEOUT = 30
+
+ # Polling and refresh intervals (public settings)
+ POLL_INTERVAL = 5  # seconds
+ LEADERBOARD_REFRESH_INTERVAL = 30  # seconds
+
+ # CSV file path for leaderboard data
+ CSV_FILE_PATH = "leaderboard_data.csv"
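
A minimal sketch of the `.env` file this module expects; the variable names come from the code above, and the values are placeholders, not real endpoints or credentials:

```
# .env (placeholder values)
API_URL=https://evaluation-backend.example.com
API_USERNAME=leaderboard-bot
API_PASSWORD=change-me
```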
data_processor.py ADDED
@@ -0,0 +1,208 @@
+ #!/usr/bin/env python3
+ """
+ Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
+ Simplified version for loading and processing CSV data
+ """
+
+ import os
+ import pandas as pd
+ from pandas.io.formats.style import Styler
+ from matplotlib.colors import LinearSegmentedColormap
+ import html
+
+ # CSV file path
+ CSV_FILE_PATH = "leaderboard_data.csv"
+
+
+ def load_leaderboard_from_csv() -> pd.DataFrame:
+     """Load leaderboard data from the CSV file"""
+     try:
+         if not os.path.exists(CSV_FILE_PATH):
+             print(f"❌ CSV file not found: {CSV_FILE_PATH}")
+             return create_empty_leaderboard_dataframe()
+
+         df = pd.read_csv(CSV_FILE_PATH)
+         print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}")
+
+         # Convert to leaderboard format
+         leaderboard_df = csv_to_leaderboard_format(df)
+
+         # Sort by Mean (Task) score and add rankings
+         leaderboard_df = leaderboard_df.sort_values("Mean (Task)", ascending=False).reset_index(drop=True)
+         leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1)
+
+         return leaderboard_df
+
+     except Exception as e:
+         print(f"❌ Error loading CSV: {e}")
+         return create_empty_leaderboard_dataframe()
+
+
+ def create_empty_leaderboard_dataframe() -> pd.DataFrame:
+     """Create an empty DataFrame with the proper leaderboard column structure"""
+     return pd.DataFrame(columns=[
+         "Rank",
+         "Model",
+         "Mean (Task)",
+         "Mean (TaskType)",
+         "Classification",
+         "Clustering",
+         "Pair Classification",
+         "Retrieval",
+         "STS",
+         "Correlation",
+         "Parameters",
+         "Embed Dim",
+         "Max Sequence Length",
+         "Vocab Size",
+     ])
+
+
+ def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
+     """Convert CSV data to leaderboard format"""
+     data = []
+     for idx, row in df.iterrows():
+         model_name = row['Model']
+
+         # Prepare model name for display
+         model_name_clean = html.escape(model_name)
+
+         # Create a clickable HuggingFace link for the model name
+         hf_link = f"https://huggingface.co/{model_name_clean}"
+         clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'
+
+         # Handle different column name variations
+         embedding_dim_col = 'Embedding Dim'
+         max_seq_col = 'Max Seq Length'
+         pair_classification_col = 'Pair Classification'
+
+         data_row = {
+             "Rank": idx + 1,  # Initial ranking, recalculated after sorting
+             "Model": clickable_model,
+             "Mean (Task)": round(float(row['Mean (Task)']), 2),
+             "Mean (TaskType)": round(float(row['Mean (TaskType)']), 2),
+             "Classification": round(float(row['Classification']), 2),
+             "Clustering": round(float(row['Clustering']), 2),
+             "Pair Classification": round(float(row[pair_classification_col]), 2),
+             "Retrieval": round(float(row['Retrieval']), 2),
+             "STS": round(float(row['STS']), 2),
+             "Correlation": round(float(row['Correlation']), 3) if not pd.isna(row['Correlation']) else "N/A",
+             "Parameters": row['Number of Parameters'],
+             "Embed Dim": int(float(row[embedding_dim_col])) if not pd.isna(row[embedding_dim_col]) else 0,
+             "Max Sequence Length": "N/A" if pd.isna(row[max_seq_col]) or row[max_seq_col] == "Unknown" else int(float(row[max_seq_col])),
+             "Vocab Size": int(float(row['Vocab Size'])) if 'Vocab Size' in row and not pd.isna(row['Vocab Size']) else 0
+         }
+         data.append(data_row)
+
+     return pd.DataFrame(data)
+
+
+ def create_excel_like_cmap():
+     """Create an Excel-like colormap for score visualization"""
+     colors = [
+         (0.9, 0.1, 0.2),          # Red
+         (1.0, 1.0, 0.0),          # Yellow
+         (0/255, 176/255, 80/255)  # Excel-style green
+     ]
+
+     return LinearSegmentedColormap.from_list("excel_like", colors, N=256)
+
+
+ def rgb_to_hex(rgb_tuple):
+     """Convert an RGB tuple to a hex color"""
+     r, g, b = [int(x * 255) for x in rgb_tuple[:3]]
+     return f"#{r:02x}{g:02x}{b:02x}"
+
+
+ def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
+     """Create colored cell HTML for score visualization"""
+     if pd.isna(value) or value == "N/A":
+         return str(value)
+
+     try:
+         # Normalize value to the 0-1 range
+         if max_val > min_val:
+             normalized = (float(value) - min_val) / (max_val - min_val)
+         else:
+             normalized = 0.5
+
+         # Get color from the colormap
+         color_rgba = colormap(normalized)
+         color_hex = rgb_to_hex(color_rgba)
+
+         # Create colored cell HTML with a data-sort attribute for proper numeric sorting
+         return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'
+
+     except (ValueError, TypeError):
+         return str(value)
+
+
+ def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
+     """Create a styled leaderboard dataframe with color coding using a pandas Styler.
+
+     Returns a pandas Styler object that Gradio's Dataframe can render with both colors AND correct sorting.
+     """
+     if df.empty:
+         return df.style
+
+     colormap = create_excel_like_cmap()
+
+     # Score columns to colorize
+     score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
+                      "Pair Classification", "Retrieval", "STS", "Correlation"]
+
+     # Calculate min/max for each score column for normalization
+     color_ranges = {}
+     for col in score_columns:
+         if col in df.columns:
+             numeric_values = pd.to_numeric(df[col], errors='coerce')
+             if not numeric_values.isna().all():
+                 color_ranges[col] = {
+                     'min': numeric_values.min(),
+                     'max': numeric_values.max()
+                 }
+
+     def apply_color_gradient(val, col_name):
+         """Apply a background color based on the value"""
+         if col_name not in color_ranges:
+             return ''
+
+         if pd.isna(val) or val == "N/A":
+             return ''
+
+         try:
+             min_val = color_ranges[col_name]['min']
+             max_val = color_ranges[col_name]['max']
+
+             # Normalize value to the 0-1 range
+             if max_val > min_val:
+                 normalized = (float(val) - min_val) / (max_val - min_val)
+             else:
+                 normalized = 0.5
+
+             # Get color from the colormap
+             color_rgba = colormap(normalized)
+             color_hex = rgb_to_hex(color_rgba)
+
+             return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
+         except (ValueError, TypeError):
+             return ''
+
+     # Apply styling to score columns using Styler.map (applymap is deprecated)
+     styler = df.style
+     for col in score_columns:
+         if col in df.columns:
+             styler = styler.map(lambda val, c=col: apply_color_gradient(val, c), subset=[col])
+
+     # Format score columns to 2 decimal places
+     format_dict = {col: '{:.2f}' for col in score_columns if col in df.columns}
+
+     if format_dict:
+         styler = styler.format(format_dict, na_rep='N/A')
+
+     return styler
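
A quick illustration of the normalization the color helpers implement. The min/max values below are the lowest and highest Mean (Task) scores in the shipped leaderboard_data.csv; the intermediate score is arbitrary:

```python
# Illustrative check of the colormap helpers from data_processor.py.
from data_processor import create_excel_like_cmap, rgb_to_hex

cmap = create_excel_like_cmap()
lo, hi = 27.66, 69.39  # min/max Mean (Task) in the shipped CSV
for score in (lo, 50.0, hi):
    normalized = (score - lo) / (hi - lo)       # same formula as apply_color_gradient
    print(score, rgb_to_hex(cmap(normalized)))  # red at the bottom, green at the top
```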
evaluation_service.py ADDED
@@ -0,0 +1,190 @@
+ #!/usr/bin/env python3
+ """
+ Evaluation Service module for MTEB Turkish Leaderboard
+ Handles evaluation submissions and status tracking
+ """
+
+ import time
+ import re
+ from typing import Optional, Tuple, List
+ import traceback
+ import pandas as pd
+ import gradio as gr
+
+ from api_client import send_evaluation_request_to_api, get_evaluation_status, cancel_evaluation_request
+
+ # Global state management for active evaluations
+ active_evaluations = {}  # request_id -> {"status": str, "model_name": str, "email": str, "start_time": float}
+
+
+ def get_active_evaluations_status() -> str:
+     """Show the status of active evaluations"""
+     if not active_evaluations:
+         return "🟢 No active evaluation requests"
+
+     status_lines = []
+     for request_id, info in active_evaluations.items():
+         model_name = info["model_name"]
+         email = info["email"]
+         elapsed = int(time.time() - info["start_time"])
+         status = info.get("status", "PENDING")
+         status_lines.append(f"🔄 {model_name} ({email}) - {request_id} [{status}] ({elapsed}s)")
+
+     return "\n".join(status_lines)
+
+
+ def get_active_evaluations_with_cancel_options() -> Tuple[str, List[str]]:
+     """Get active evaluations status and cancellation options"""
+     status_text = get_active_evaluations_status()
+
+     cancel_options = []
+     for request_id, info in active_evaluations.items():
+         model_name = info["model_name"]
+         cancel_options.append(f"{request_id} - {model_name}")
+
+     return status_text, cancel_options
+
+
+ def clear_active_evaluations() -> str:
+     """Clear all active evaluations from tracking"""
+     global active_evaluations
+     count = len(active_evaluations)
+     active_evaluations.clear()
+     return f"✅ Cleared {count} active evaluation(s) from tracking"
+
+
+ def cancel_active_evaluation(selection: str) -> str:
+     """Cancel a selected active evaluation"""
+     if not selection:
+         return "❌ No evaluation selected for cancellation"
+
+     try:
+         request_id = selection.split(" - ")[0]
+
+         if request_id not in active_evaluations:
+             return f"❌ Evaluation {request_id} not found in active evaluations"
+
+         # Try to cancel via the API
+         success = cancel_evaluation_request(request_id)
+
+         if success:
+             model_name = active_evaluations[request_id]["model_name"]
+             del active_evaluations[request_id]
+             return f"✅ Successfully cancelled evaluation for {model_name} (ID: {request_id})"
+         else:
+             return f"❌ Failed to cancel evaluation {request_id}. Check the API connection."
+
+     except Exception as e:
+         return f"❌ Error cancelling evaluation: {str(e)}"
+
+
+ def _validate_evaluation_request(model_name: str, email: Optional[str] = None) -> Optional[str]:
+     """Validate evaluation request parameters; return an error message, or None if valid"""
+     # Model name validation
+     if not model_name or not model_name.strip():
+         return "❌ Model name cannot be empty!"
+
+     model_name = model_name.strip()
+
+     # Check model name length (format: org/model-name)
+     if len(model_name) < 3:
+         return "❌ Model name too short!"
+
+     if len(model_name) > 256:
+         return "❌ Model name too long (maximum 256 characters)!"
+
+     # Check for a valid HuggingFace model name format (must be org/model)
+     if '/' not in model_name:
+         return "❌ Invalid model name format! Must include the organization (e.g., organization/model-name)"
+
+     if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
+         return "❌ Invalid model name format! Use the format: organization/model-name"
+
+     # Email validation
+     if not email or not email.strip():
+         return "❌ Email address cannot be empty!"
+
+     email = email.strip()
+
+     if len(email) > 254:
+         return "❌ Email address too long!"
+
+     email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
+     if not re.match(email_pattern, email):
+         return "❌ Invalid email address format!"
+
+     return None
+
+
+ def submit_evaluation(model_name: str, email: str, batch_size: int, current_data: pd.DataFrame, progress=gr.Progress()) -> Tuple[str, Optional[pd.DataFrame]]:
+     """Validate and submit an evaluation request; return (message, leaderboard data)"""
+     try:
+         # Input validation
+         error_msg = _validate_evaluation_request(model_name, email)
+         if error_msg:
+             return error_msg, None
+
+         # Show progress
+         progress(0.1, desc="Sending evaluation request to API...")
+
+         # Send the request to the API. Regardless of the backend response, show success
+         # to the user: backend errors (such as duplicate requests) are handled by the
+         # API and communicated via email.
+         send_evaluation_request_to_api(model_name, batch_size, email)
+
+         progress(1.0, desc="Request submitted successfully!")
+
+         success_msg = f"""
+ ✅ Evaluation request submitted successfully!
+
+ 🤖 Model: {model_name}
+ 📧 Email: {email}
+
+ 📋 Next Steps:
+ ⏱️ Your request will be reviewed by our system
+ 📧 You will receive email notifications about the status of your evaluation
+ 🔄 If you've submitted this model before, you'll be notified via email
+
+ Thank you for contributing to the Mizan Leaderboard!
+ """
+
+         return success_msg.strip(), current_data
+
+     except Exception as e:
+         # Log the error for debugging
+         print(f"❌ Error submitting evaluation: {str(e)}")
+         traceback.print_exc()
+
+         error_msg = f"""
+ ❌ Failed to submit evaluation request
+
+ 🤖 Model: {model_name}
+ 📧 Email: {email}
+
+ ⚠️ Error: Unable to connect to the evaluation service.
+
+ Please try again later or contact support if the problem persists.
+ """
+         return error_msg.strip(), None
+
+
+ def refresh_evaluation_status() -> str:
+     """Refresh the status of all active evaluations"""
+     if not active_evaluations:
+         return "🟢 No active evaluations to refresh"
+
+     updated_count = 0
+     for request_id, info in active_evaluations.items():
+         try:
+             status_data = get_evaluation_status(request_id)
+             if status_data and "status" in status_data:
+                 old_status = info.get("status", "UNKNOWN")
+                 new_status = status_data["status"]
+                 if old_status != new_status:
+                     info["status"] = new_status
+                     updated_count += 1
+                     print(f"Status updated for {request_id}: {old_status} -> {new_status}")
+         except Exception as e:
+             print(f"Error refreshing status for {request_id}: {e}")
+
+     return f"🔄 Refreshed status for {len(active_evaluations)} evaluation(s). {updated_count} status change(s) detected."
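
A few illustrative calls to the validator; the expected outcomes follow directly from the rules in `_validate_evaluation_request` above (it is a private helper, imported here only for demonstration):

```python
from evaluation_service import _validate_evaluation_request

# Well-formed org/model name and email -> None (no error)
print(_validate_evaluation_request("sentence-transformers/all-MiniLM-L6-v2", "user@example.com"))

# Missing organization prefix -> format error message
print(_validate_evaluation_request("my-model", "user@example.com"))

# Malformed email -> email format error message
print(_validate_evaluation_request("org/my-model", "not-an-email"))
```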
leaderboard_data.csv ADDED
@@ -0,0 +1,33 @@
+ Model,Number of Parameters,Embedding Dim,Max Seq Length,Mean (Task),Mean (TaskType),Classification,Clustering,Pair Classification,Retrieval,STS,Correlation,Vocab Size
+ BAAI/bge-m3,567M,1024,8192,69.39,63.51,75.68,35.26,78.88,57.89,69.83,0.61,250002
+ intfloat/multilingual-e5-large,559M,1024,512,66.61,62.08,71.8,41.2,72.76,57.17,67.49,0.58,250002
+ newmindai/TurkEmbed4STS,305M,768,8192,65.66,62.03,69.69,44.29,81.77,47.6,66.79,0.68,250048
+ ytu-ce-cosmos/turkish-e5-large,559M,1024,512,64.93,59.73,72.42,38.51,70.86,47.6,69.24,0.56,250002
+ intfloat/multilingual-e5-large-instruct,559M,1024,512,64.33,58.57,72.25,33.16,72.92,44.95,69.56,0.57,250002
+ nomic-ai/nomic-embed-text-v2-moe,475M,768,512,64.28,60.15,70.07,41.28,63.87,56.4,69.16,0.53,250048
+ Alibaba-NLP/gte-multilingual-base,305M,768,32768,63.86,60.04,68.0,39.16,76.0,50.12,66.94,0.62,250048
+ sentence-transformers/paraphrase-multilingual-mpnet-base-v2,278M,768,512,63.33,57.63,70.88,41.35,83.6,33.81,58.51,0.65,250002
+ newmindai/modernbert-base-tr-uncased-allnli-stsb,134M,768,8192,61.29,54.09,71.47,35.46,82.83,24.81,55.89,0.66,32000
+ numind/NuSentiment-multilingual,278M,768,512,60.52,49.65,73.67,14.96,76.89,32.76,49.96,0.52,250002
+ newmindai/TurkEmbed4Retrieval,305M,768,512,60.5,58.04,64.78,47.47,64.04,47.82,66.1,0.57,250048
+ Qwen/Qwen3-Embedding-0.6B,595M,1024,131072,60.18,56.53,64.68,33.36,66.02,50.06,68.55,0.48,151669
+ sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,117M,384,512,59.95,54.8,67.21,42.31,79.3,29.95,55.24,0.6,250037
+ newmindai/TurkEmbed4STS-HD,305M,768,8192,59.94,53.06,67.61,34.24,80.08,35.88,47.47,0.65,250048
+ emrecan/bert-base-turkish-cased-mean-nli-stsb-tr,110M,768,512,59.92,52.65,68.38,24.61,74.94,39.0,56.3,0.62,32000
+ ibm-granite/granite-embedding-278m-multilingual,278M,768,512,55.9,54.48,58.64,41.98,60.13,45.08,66.57,0.41,250002
+ newmindai/ModernBERT-tr-uncased-stsb-HD,134M,768,8192,54.51,43.94,67.17,17.96,82.51,16.08,35.98,0.61,32000
+ ibm-granite/granite-embedding-107m-multilingual,106M,384,512,52.68,50.72,55.75,34.17,59.86,39.97,63.85,0.38,250002
+ minishlab/potion-multilingual-128M,128M,256,N/A,50.39,44.47,58.34,23.47,59.76,30.84,49.93,0.43,500358
+ google/embeddinggemma-300m,307M,768,2048,49.08,44.98,55.23,22.84,61.02,26.92,58.91,0.27,262144
+ nomic-ai/nomic-embed-text-v1,136M,768,8192,45.12,41.46,48.3,9.45,59.75,32.9,56.88,0.42,30528
+ nomic-ai/nomic-embed-text-v1.5,136M,768,8192,44.63,40.04,48.92,9.69,58.53,32.19,50.89,0.41,30528
+ mixedbread-ai/mxbai-embed-large-v1,335M,1024,512,44.0,39.23,49.49,15.99,56.66,27.75,46.25,0.37,30522
+ sentence-transformers/multi-qa-MiniLM-L6-cos-v1,22M,384,512,38.82,32.39,44.08,5.55,58.29,25.16,28.88,0.34,30522
+ boun-tabi-LMG/TURNA,495M,1024,1024,38.36,30.96,47.17,10.26,56.62,13.04,27.73,0.22,32128
+ sentence-transformers/all-MiniLM-L12-v2,33M,384,512,38.28,31.13,44.77,7.82,58.2,21.64,23.24,0.36,30522
+ nielsr/lilt-xlm-roberta-base,284M,768,512,38.01,29.57,50.1,12.79,55.35,2.45,27.14,0.22,250002
+ sentence-transformers/all-MiniLM-L6-v2,22M,384,512,37.95,31.97,44.46,6.58,56.75,16.48,35.55,0.31,30522
+ sentence-transformers/all-mpnet-base-v2,109M,768,512,37.21,31.31,43.75,10.56,55.99,15.16,31.08,0.31,30527
+ minishlab/potion-base-8M,7M,256,N/A,36.85,30.01,42.51,2.26,57.86,21.75,25.64,0.36,29528
+ sentence-transformers/paraphrase-MiniLM-L6-v2,22M,384,512,36.26,28.19,44.02,4.53,56.62,17.47,18.29,0.33,30522
+ newmindai/lettucedect-210m-eurobert-tr-v1,211M,768,8192,27.66,21.55,34.32,1.54,52.34,0.22,19.34,0.1,128256
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio>=5.49.1
+ pandas>=2.3.3
+ numpy>=2.3.4
+ matplotlib>=3.10.7
+ requests>=2.32.5
+ python-dotenv>=1.1.1
+ itsdangerous>=2.2.0
ui_components.py ADDED
@@ -0,0 +1,259 @@
+ #!/usr/bin/env python3
+ """
+ UI Components module for MTEB Turkish Leaderboard - HF Spaces Version
+ Simplified version with only leaderboard and dataset components
+ """
+
+ import gradio as gr
+ import pandas as pd
+ from data_processor import (create_styled_leaderboard_dataframe,
+                             create_empty_leaderboard_dataframe)
+
+
+ def create_leaderboard_tab(current_data: pd.DataFrame):
+     """Create the main leaderboard tab with color styling"""
+
+     # Handle empty or invalid data
+     if current_data.empty or "Model" not in current_data.columns:
+         print("⚠️ Warning: Empty or invalid data, using empty leaderboard structure")
+         current_data = create_empty_leaderboard_dataframe()
+
+     # Apply color styling to score columns using a pandas Styler
+     styled_data = create_styled_leaderboard_dataframe(current_data)
+
+     leaderboard = gr.Dataframe(
+         value=styled_data,
+         interactive=False,
+         wrap=True,
+         max_height=600,
+         show_search=True,
+         datatype=["number", "html", "number", "number", "number", "number", "number", "number", "number", "number", "str", "number", "str", "number"],  # Model column as HTML for clickable links
+         column_widths=["70px", "250px", "130px", "130px", "160px", "130px", "170px", "130px", "100px", "130px", "120px", "120px", "120px", "120px"]
+     )
+
+     # Information about the leaderboard
+     gr.Markdown("""
+     ### 🔍 How to Use the Leaderboard:
+     - **Search**: Use the search box to find specific models
+     - **Color Coding**: Scores are color-coded from red (low) to green (high)
+     - **Sorting**: Click on column headers to sort by different metrics
+     - **Rankings**: Models are ranked by Mean (Task) score
+
+     ### 📊 Performance Insights:
+     - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
+     - **Specialized Models**: Some models excel at specific tasks (e.g., retrieval vs. classification)
+     - **Model Size vs. Performance**: Larger models generally perform better, with exceptions
+     """)
+
+     return leaderboard
+
+
+ def create_dataset_tab():
+     """Create the dataset information tab"""
+
+     gr.Markdown("### 📊 MTEB Turkish Dataset Overview")
+
+     # Task name to dataset path mapping
+     task_to_dataset = {
+         'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
+         'XQuADRetrieval': 'google/xquad',
+         'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
+         'MKQARetrieval': 'apple/mkqa',
+         'MassiveIntentClassification': 'mteb/amazon_massive_intent',
+         'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
+         'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
+         'SIB200Classification': 'mteb/sib200',
+         'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
+         'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
+         'SIB200ClusteringS2S': 'mteb/sib200',
+         'XNLI': 'mteb/xnli',
+         'XNLIV2': 'mteb/xnli2.0-multi-pair',
+         'STS22.v2': 'mteb/sts22-crosslingual-sts'
+     }
+
+     # Create clickable task names
+     clickable_task_names = []
+     for task_name in [
+         'WebFAQRetrieval', 'XQuADRetrieval', 'TurHistQuadRetrieval', 'MKQARetrieval',
+         'MassiveIntentClassification', 'MassiveScenarioClassification',
+         'MultilingualSentimentClassification', 'SIB200Classification',
+         'TurkishMovieSentimentClassification', 'TurkishProductSentimentClassification',
+         'SIB200ClusteringS2S', 'XNLI', 'XNLIV2', 'STS22.v2'
+     ]:
+         dataset_path = task_to_dataset[task_name]
+         hf_link = f"https://huggingface.co/datasets/{dataset_path}"
+         clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
+         clickable_task_names.append(clickable_name)
+
+     # Create the dataset information table
+     dataset_data = pd.DataFrame({
+         'Task Name': clickable_task_names,
+         'Task Type': [
+             'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
+             'Classification', 'Classification',
+             'Classification', 'Classification',
+             'Classification', 'Classification',
+             'Clustering', 'PairClassification', 'PairClassification', 'STS'
+         ],
+         'Description': [
+             'Turkish FAQ retrieval task',
+             'Turkish question answering retrieval',
+             'Historical Turkish document retrieval',
+             'Multilingual knowledge QA retrieval',
+             'Intent classification for Turkish',
+             'Scenario classification for Turkish',
+             'Multilingual sentiment classification',
+             'SIB200 language identification',
+             'Turkish movie review sentiment',
+             'Turkish product review sentiment',
+             'SIB200 clustering task',
+             'Turkish natural language inference',
+             'Enhanced Turkish NLI task',
+             'Turkish semantic textual similarity'
+         ],
+         'Domain': [
+             'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
+             'Intent', 'Scenario',
+             'Sentiment', 'Language ID',
+             'Movies', 'Products',
+             'Language ID', 'NLI', 'NLI', 'STS'
+         ],
+         'Samples': [
+             '~135K', '~10K', '~1.4K', '~10K',
+             '~11K', '~11K',
+             '~4.5K', '~700',
+             '~8K', '~4.8K',
+             '~1K', '~1.4K', '~1.4K', '~400'
+         ]
+     })
+
+     dataset_table = gr.Dataframe(
+         value=dataset_data,
+         label="MTEB Turkish Task Details",
+         interactive=False,
+         wrap=True,
+         datatype=["html", "str", "str", "str", "str"]  # Task Name column as HTML for clickable links
+     )
+
+     # Task type distribution
+     gr.Markdown("""
+     ### 📈 Task Distribution:
+
+     **By Task Type:**
+     - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
+     - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
+     - **Pair Classification**: 2 tasks (natural language inference)
+     - **Clustering**: 1 task (language clustering)
+     - **STS**: 1 task (semantic textual similarity)
+
+     **By Domain:**
+     - **Sentiment Analysis**: Movie and product reviews
+     - **Question Answering**: FAQ, reading comprehension, and knowledge QA
+     - **Intent/Scenario**: Conversational AI applications
+     - **Language Tasks**: NLI, STS, clustering
+     - **Multilingual**: Cross-lingual evaluation capabilities
+     """)
+
+     # Statistics summary
+     stats_data = pd.DataFrame({
+         'Metric': [
+             'Total Tasks',
+             'Total Samples',
+             'Task Types',
+             'Languages',
+             'Avg. Tokens per Sample'
+         ],
+         'Value': [
+             '14 tasks',
+             '~190K samples',
+             '5 types',
+             'Turkish + Multilingual',
+             '~150 tokens'
+         ],
+         'Notes': [
+             'Comprehensive evaluation across domains',
+             'Large-scale evaluation dataset',
+             'Classification, Retrieval, STS, NLI, Clustering',
+             'Focus on Turkish with multilingual support',
+             'Varies by task type and domain'
+         ]
+     })
+
+     gr.Dataframe(
+         value=stats_data,
+         label="Dataset Statistics Summary",
+         interactive=False
+     )
+
+     gr.Markdown("""
+     ### 🎯 Evaluation Methodology:
+
+     **Scoring:**
+     - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
+     - **Mean (Task)**: Direct average of all individual task scores
+     - **Mean (TaskType)**: Average of the task category means
+     - **Individual Categories**: Performance in each task type
+
+     **Model Ranking:**
+     - Primary ranking by **Mean (Task)** score
+     - Correlation metrics provide additional insight
+     - Task-specific performance shows model strengths
+
+     **Quality Assurance:**
+     - Standardized evaluation protocols
+     - Consistent preprocessing across tasks
+     - Multiple metrics per task for robustness
+     """)
+
+     return dataset_table
+
+
+ def create_submit_evaluation_tab():
+     """Create the submit evaluation tab with the submission form"""
+
+     gr.Markdown("### 🚀 Submit Model for Evaluation")
+     gr.Markdown("""
+     Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
+     **Authentication with Hugging Face is required to submit evaluations.**
+     """)
+
+     # OAuth login button
+     login_button = gr.LoginButton(value="Sign in with Hugging Face")
+
+     model_input = gr.Textbox(
+         label="🤖 Model Name",
+         placeholder="sentence-transformers/your-model",
+         info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
+     )
+
+     email_input = gr.Textbox(
+         label="📧 Email Address",
+         placeholder="your.email@example.com",
+         info="Email for notifications about evaluation status and results"
+     )
+
+     submit_btn = gr.Button(
+         "🚀 Submit",
+         variant="primary",
+         size="lg"
+     )
+
+     # Result output for authentication messages
+     result_output = gr.HTML(label="Status")
+
+     # Information about the evaluation process
+     gr.Markdown("""
+     ### 📋 Evaluation Process:
+     1. **Sign In**: First, sign in with your Hugging Face account using the button above
+     2. **Submit Request**: Fill out the form with your model details and email
+     3. **Admin Review**: Your request will be reviewed by administrators
+     4. **Evaluation**: If approved, your model will be evaluated on the MTEB Turkish benchmark
+     5. **Results**: You'll receive email notifications, and results will appear on the leaderboard
+
+     ### ⚠️ Important Notes:
+     - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
+     - You'll receive email updates about your request status
+     - Make sure your model is publicly available on HuggingFace
+     - A valid email address is required for receiving results
+     """)
+
+     return (model_input, email_input, submit_btn, login_button, result_output)