nmmursit committed
Commit 9a235dc · 0 Parent(s)

Initial commit

Files changed (10)
  1. .gitattributes +35 -0
  2. README.md +22 -0
  3. api_client.py +103 -0
  4. app.py +137 -0
  5. config.py +28 -0
  6. data_processor.py +208 -0
  7. evaluation_service.py +190 -0
  8. leaderboard_data.csv +33 -0
  9. requirements.txt +7 -0
  10. ui_components.py +259 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,22 @@
+ ---
+ title: Mizan
+ emoji: 📊
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.49.1
+ python_version: 3.12
+ app_file: app.py
+ pinned: false
+ short_description: Display benchmark results for embedding models
+ license: apache-2.0
+
+ # OAuth configuration
+ hf_oauth: true
+ hf_oauth_client_id: "${OAUTH_CLIENT_ID}"
+ hf_oauth_client_secret: "${OAUTH_CLIENT_SECRET}"
+ hf_oauth_expiration_minutes: 30
+ hf_oauth_scopes:
+   - email
+
+ ---
api_client.py ADDED
@@ -0,0 +1,103 @@
+ #!/usr/bin/env python3
+ """
+ API Client module for MTEB Turkish Leaderboard
+ """
+
+ from typing import Optional, Dict, Any
+ import traceback
+ import requests
+
+ from config import API_BASE_URL, API_TIMEOUT, API_URL, USERNAME, PASSWORD
+
+
+ def check_api_health() -> bool:
+     """Check if the API is available"""
+     try:
+         response = requests.get(f"{API_BASE_URL}/api/v1/health", timeout=5)
+         return response.status_code == 200
+     except requests.RequestException:
+         return False
+
+
+ def send_evaluation_request_to_api(model_name: str, batch_size: int = 32, email: str = "user@example.com") -> Optional[Dict[str, Any]]:
+     """
+     Send an evaluation request to the API for the specified model.
+     Returns the API response as a dictionary if successful, otherwise None.
+     """
+     try:
+         payload = {
+             "model_name": model_name,
+             "model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
+             "batch_size": batch_size,
+             "email": email,
+             "model_type": "sentence-transformer"
+         }
+
+         # Authentication credentials
+         auth = (USERNAME, PASSWORD)
+
+         response = requests.post(
+             f"{API_URL}/api/mteb/request",
+             json=payload,
+             timeout=API_TIMEOUT,
+             auth=auth
+         )
+
+         print(f"Response Status: {response.status_code}")
+
+         if response.status_code == 200:
+             return response.json()
+         else:
+             print(f"API Error: {response.status_code}")
+             try:
+                 error_detail = response.json()
+                 print(f"  Error Detail: {error_detail}")
+             except ValueError:
+                 print(f"  Raw Response: {response.text}")
+             return None
+
+     except Exception as e:
+         print(f"API Call Error: {e}")
+         traceback.print_exc()
+         return None
+
+
+ def get_evaluation_status(request_id: str) -> Optional[Dict[str, Any]]:
+     """Get evaluation status from the API"""
+     try:
+         auth = (USERNAME, PASSWORD)
+
+         response = requests.get(
+             f"{API_URL}/api/mteb/status/{request_id}",
+             timeout=API_TIMEOUT,
+             auth=auth
+         )
+
+         if response.status_code == 200:
+             return response.json()
+         else:
+             print(f"Status check error: {response.status_code}")
+             return None
+
+     except Exception as e:
+         print(f"Status check error: {e}")
+         return None
+
+
+ def cancel_evaluation_request(request_id: str) -> bool:
+     """Cancel an evaluation request"""
+     try:
+         auth = (USERNAME, PASSWORD)
+
+         response = requests.delete(
+             f"{API_URL}/api/mteb/request/{request_id}",
+             timeout=API_TIMEOUT,
+             auth=auth
+         )
+
+         return response.status_code == 200
+
+     except Exception as e:
+         print(f"Cancel request error: {e}")
+         return False
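
For illustration, a minimal sketch of how this client is meant to be called. It assumes the backend from config.py is reachable and that `API_USERNAME`/`API_PASSWORD` are set; the `request_id` response field is an assumption, since the actual schema is defined by the backend:

```python
# Hypothetical usage of api_client.py; not part of the committed code.
from api_client import check_api_health, send_evaluation_request_to_api, get_evaluation_status

if check_api_health():
    result = send_evaluation_request_to_api("org/my-model", batch_size=32, email="me@example.com")
    # "request_id" is an assumed response field; adjust to the real schema.
    if result and "request_id" in result:
        print(get_evaluation_status(result["request_id"]))
else:
    print("API is not reachable")
```

Note that `check_api_health` targets `API_BASE_URL` (hard-coded to localhost) while the request functions target the `API_URL` environment setting, so the two can point at different hosts.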
app.py ADDED
@@ -0,0 +1,137 @@
+ #!/usr/bin/env python3
+ """
+ Mizan Leaderboard - Enhanced Version with Submit Functionality
+ Includes leaderboard display, model submission, and evaluation tracking
+ """
+
+ import gradio as gr
+
+ from ui_components import (
+     create_leaderboard_tab, create_dataset_tab, create_submit_evaluation_tab
+ )
+ from data_processor import load_leaderboard_from_csv
+ from evaluation_service import submit_evaluation
+
+ # Global data storage
+ current_data = None
+
+
+ def create_leaderboard_demo():
+     """Create enhanced leaderboard demo interface with submit functionality"""
+
+     global current_data
+
+     # Load data from the CSV file
+     current_data = load_leaderboard_from_csv()
+
+     with gr.Blocks(
+         title="Mizan",
+         theme=gr.themes.Soft()
+     ) as demo:
+
+         gr.Markdown("""
+         # Mizan Leaderboard
+
+         Performance comparison for Turkish embedding models
+         """)
+
+         with gr.Tabs():
+             # Tab 1: Leaderboard
+             with gr.Tab("📊 Leaderboard"):
+                 leaderboard_table = create_leaderboard_tab(current_data)
+
+             # Tab 2: Submit
+             with gr.Tab("🚀 Submit"):
+                 (model_input, email_input, submit_btn, login_button, result_output) = create_submit_evaluation_tab()
+
+                 # Submit evaluation functionality with authentication
+                 def handle_submit_evaluation(model_name, email, profile, progress=gr.Progress()):
+                     import logging
+
+                     # Authentication check
+                     if profile is None:
+                         logging.warning("Unauthorized submission attempt with no profile")
+                         return "<p style='color: red; font-weight: bold;'>Authentication required. Please log in with your Hugging Face account.</p>"
+
+                     # IMPORTANT: In local development, Gradio returns the "Sign in with Hugging Face"
+                     # string. This is NOT real authentication, just a placeholder for local testing.
+                     if isinstance(profile, str) and profile == "Sign in with Hugging Face":
+                         # Block submission in local dev with mock auth
+                         return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"
+
+                     # Email is required
+                     if not email or email.strip() == "":
+                         return "<p style='color: red; font-weight: bold;'>Email address is required to receive benchmark results.</p>"
+
+                     global current_data
+                     batch_size = 32  # Always use the default batch size
+                     result_msg, updated_data = submit_evaluation(model_name, email, batch_size, current_data, progress)
+                     # Note: the leaderboard is not updated here because evaluation is async;
+                     # it is updated manually when results become available.
+                     logging.info(f"Submission processed for model: {model_name} by user: {profile}")
+                     return result_msg
+
+                 submit_btn.click(
+                     fn=handle_submit_evaluation,
+                     inputs=[model_input, email_input, login_button],
+                     outputs=[result_output]
+                 )
+
+             # Tab 3: Dataset Information
+             with gr.Tab("📊 Dataset Information"):
+                 dataset_table = create_dataset_tab()
+                 gr.Markdown("""
+                 ---
+                 ### 📊 Metrics Explanation:
+                 - **Mean (Task)**: Average performance across all individual tasks
+                 - **Mean (TaskType)**: Average performance by task category
+                 - **Classification**: Performance on Turkish classification tasks
+                 - **Clustering**: Performance on Turkish clustering tasks
+                 - **Pair Classification**: Performance on pair classification tasks (such as NLI)
+                 - **Retrieval**: Performance on information retrieval tasks
+                 - **STS**: Performance on Semantic Textual Similarity tasks
+                 - **Correlation**: Weighted average of correlation metrics for the NLI and STSB datasets
+                 - **Parameters**: Number of model parameters
+                 - **Embed Dim**: Embedding dimension size
+                 - **Max Seq Length**: Maximum sequence length the model can process (0 = unlimited)
+                 - **Vocab Size**: Size of the model's vocabulary
+
+                 ### 📖 About Mizan:
+                 This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
+                 on Turkish language tasks across multiple domains, including:
+                 - Text classification and sentiment analysis
+                 - Information retrieval and search
+                 - Semantic textual similarity
+                 - Text clustering and pair classification
+
+                 ### 🚀 Submit Your Model:
+                 Use the **Submit** tab to submit your Turkish embedding model for evaluation.
+                 Your request will be reviewed by administrators, and you will receive email notifications about its progress.
+
+                 ### Contact:
+                 For any questions or feedback, please contact info@newmind.ai
+
+                 ### Links:
+                 - **GitHub**: [mteb/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
+                 """)
+
+     return demo
+
+
+ def main():
+     """Main entry point"""
+     print("🚀 Starting Mizan Leaderboard...")
+
+     demo = create_leaderboard_demo()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False
+     )
+
+
+ if __name__ == "__main__":
+     main()
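
The submit handler above receives the LoginButton's value as its `profile` argument. For comparison, Gradio's documented OAuth pattern injects the profile through a typed parameter instead of wiring the button as an input; a minimal sketch, assuming the app runs on Spaces with `hf_oauth: true` as in the README:

```python
# Sketch of Gradio's annotation-based OAuth injection; not the committed code.
import gradio as gr

def whoami(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects the OAuth profile based on the type annotation;
    # it is None when the visitor is not logged in.
    if profile is None:
        return "Please sign in with Hugging Face."
    return f"Signed in as {profile.username}"

with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    demo.load(whoami, inputs=None, outputs=status)
```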
config.py ADDED
@@ -0,0 +1,28 @@
+ #!/usr/bin/env python3
+ """
+ Configuration module for MTEB Turkish Leaderboard
+ Centralizes environment variables and configuration settings
+ """
+
+ import os
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # API Configuration (secrets from the environment)
+ API_URL = os.environ.get("API_URL")
+ USERNAME = os.environ.get("API_USERNAME")
+ PASSWORD = os.environ.get("API_PASSWORD")
+
+ # API Configuration (public settings)
+ API_BASE_URL = "http://localhost:8000"
+ API_TIMEOUT = 30
+
+ # Polling and refresh intervals (public settings)
+ POLL_INTERVAL = 5  # seconds
+ LEADERBOARD_REFRESH_INTERVAL = 30  # seconds
+
+ # CSV file path for leaderboard data
+ CSV_FILE_PATH = "leaderboard_data.csv"
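
A minimal sketch of the `.env` file this module expects; the variable names come from the code above, and the values are placeholders, not real endpoints or credentials:

```
# .env (placeholder values)
API_URL=https://evaluation-backend.example.com
API_USERNAME=leaderboard-bot
API_PASSWORD=change-me
```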
data_processor.py ADDED
@@ -0,0 +1,208 @@
+ #!/usr/bin/env python3
+ """
+ Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
+ Simplified version for loading and processing CSV data
+ """
+
+ import os
+ import pandas as pd
+ from pandas.io.formats.style import Styler
+ from matplotlib.colors import LinearSegmentedColormap
+ import html
+
+ # CSV file path
+ CSV_FILE_PATH = "leaderboard_data.csv"
+
+
+ def load_leaderboard_from_csv() -> pd.DataFrame:
+     """Load leaderboard data from the CSV file"""
+     try:
+         if not os.path.exists(CSV_FILE_PATH):
+             print(f"❌ CSV file not found: {CSV_FILE_PATH}")
+             return create_empty_leaderboard_dataframe()
+
+         df = pd.read_csv(CSV_FILE_PATH)
+         print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}")
+
+         # Convert to leaderboard format
+         leaderboard_df = csv_to_leaderboard_format(df)
+
+         # Sort by Mean (Task) score and add rankings
+         leaderboard_df = leaderboard_df.sort_values("Mean (Task)", ascending=False).reset_index(drop=True)
+         leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1)
+
+         return leaderboard_df
+
+     except Exception as e:
+         print(f"❌ Error loading CSV: {e}")
+         return create_empty_leaderboard_dataframe()
+
+
+ def create_empty_leaderboard_dataframe() -> pd.DataFrame:
+     """Create an empty DataFrame with the proper leaderboard column structure"""
+     return pd.DataFrame(columns=[
+         "Rank",
+         "Model",
+         "Mean (Task)",
+         "Mean (TaskType)",
+         "Classification",
+         "Clustering",
+         "Pair Classification",
+         "Retrieval",
+         "STS",
+         "Correlation",
+         "Parameters",
+         "Embed Dim",
+         "Max Sequence Length",
+         "Vocab Size",
+     ])
+
+
+ def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
+     """Convert CSV data to leaderboard format"""
+     data = []
+     for idx, row in df.iterrows():
+         model_name = row['Model']
+
+         # Prepare model name for display
+         model_name_clean = html.escape(model_name)
+
+         # Create a clickable HuggingFace link for the model name
+         hf_link = f"https://huggingface.co/{model_name_clean}"
+         clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'
+
+         # Handle different column name variations
+         embedding_dim_col = 'Embedding Dim'
+         max_seq_col = 'Max Seq Length'
+         pair_classification_col = 'Pair Classification'
+
+         data_row = {
+             "Rank": idx + 1,  # Initial ranking, recalculated after sorting
+             "Model": clickable_model,
+             "Mean (Task)": round(float(row['Mean (Task)']), 2),
+             "Mean (TaskType)": round(float(row['Mean (TaskType)']), 2),
+             "Classification": round(float(row['Classification']), 2),
+             "Clustering": round(float(row['Clustering']), 2),
+             "Pair Classification": round(float(row[pair_classification_col]), 2),
+             "Retrieval": round(float(row['Retrieval']), 2),
+             "STS": round(float(row['STS']), 2),
+             "Correlation": round(float(row['Correlation']), 3) if not pd.isna(row['Correlation']) else "N/A",
+             "Parameters": row['Number of Parameters'],
+             "Embed Dim": int(float(row[embedding_dim_col])) if not pd.isna(row[embedding_dim_col]) else 0,
+             "Max Sequence Length": "N/A" if pd.isna(row[max_seq_col]) or row[max_seq_col] == "Unknown" else int(float(row[max_seq_col])),
+             "Vocab Size": int(float(row['Vocab Size'])) if 'Vocab Size' in row and not pd.isna(row['Vocab Size']) else 0
+         }
+         data.append(data_row)
+
+     return pd.DataFrame(data)
+
+
+ def create_excel_like_cmap():
+     """Create an Excel-like colormap for score visualization"""
+     colors = [
+         (0.9, 0.1, 0.2),          # Red
+         (1.0, 1.0, 0.0),          # Yellow
+         (0/255, 176/255, 80/255)  # Excel-style green
+     ]
+
+     return LinearSegmentedColormap.from_list("excel_like", colors, N=256)
+
+
+ def rgb_to_hex(rgb_tuple):
+     """Convert an RGB tuple to a hex color"""
+     r, g, b = [int(x * 255) for x in rgb_tuple[:3]]
+     return f"#{r:02x}{g:02x}{b:02x}"
+
+
+ def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
+     """Create colored cell HTML for score visualization"""
+     if pd.isna(value) or value == "N/A":
+         return str(value)
+
+     try:
+         # Normalize value to the 0-1 range
+         if max_val > min_val:
+             normalized = (float(value) - min_val) / (max_val - min_val)
+         else:
+             normalized = 0.5
+
+         # Get color from the colormap
+         color_rgba = colormap(normalized)
+         color_hex = rgb_to_hex(color_rgba)
+
+         # Create colored cell HTML with a data-sort attribute for proper numeric sorting
+         return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'
+
+     except (ValueError, TypeError):
+         return str(value)
+
+
+ def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
+     """Create a styled leaderboard dataframe with color coding using a pandas Styler.
+
+     Returns a pandas Styler object that Gradio's Dataframe can render with both colors AND correct sorting.
+     """
+     if df.empty:
+         return df.style
+
+     colormap = create_excel_like_cmap()
+
+     # Score columns to colorize
+     score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
+                      "Pair Classification", "Retrieval", "STS", "Correlation"]
+
+     # Calculate min/max for each score column for normalization
+     color_ranges = {}
+     for col in score_columns:
+         if col in df.columns:
+             numeric_values = pd.to_numeric(df[col], errors='coerce')
+             if not numeric_values.isna().all():
+                 color_ranges[col] = {
+                     'min': numeric_values.min(),
+                     'max': numeric_values.max()
+                 }
+
+     def apply_color_gradient(val, col_name):
+         """Apply a background color based on the value"""
+         if col_name not in color_ranges:
+             return ''
+
+         if pd.isna(val) or val == "N/A":
+             return ''
+
+         try:
+             min_val = color_ranges[col_name]['min']
+             max_val = color_ranges[col_name]['max']
+
+             # Normalize value to the 0-1 range
+             if max_val > min_val:
+                 normalized = (float(val) - min_val) / (max_val - min_val)
+             else:
+                 normalized = 0.5
+
+             # Get color from the colormap
+             color_rgba = colormap(normalized)
+             color_hex = rgb_to_hex(color_rgba)
+
+             return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
+         except (ValueError, TypeError):
+             return ''
+
+     # Apply styling to score columns using Styler.map (applymap is deprecated)
+     styler = df.style
+     for col in score_columns:
+         if col in df.columns:
+             styler = styler.map(lambda val, c=col: apply_color_gradient(val, c), subset=[col])
+
+     # Format score columns to 2 decimal places
+     format_dict = {col: '{:.2f}' for col in score_columns if col in df.columns}
+
+     if format_dict:
+         styler = styler.format(format_dict, na_rep='N/A')
+
+     return styler
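
A quick illustration of the normalization the color helpers implement. The min/max values below are the lowest and highest Mean (Task) scores in the shipped leaderboard_data.csv; the intermediate score is arbitrary:

```python
# Illustrative check of the colormap helpers from data_processor.py.
from data_processor import create_excel_like_cmap, rgb_to_hex

cmap = create_excel_like_cmap()
lo, hi = 27.66, 69.39  # min/max Mean (Task) in the shipped CSV
for score in (lo, 50.0, hi):
    normalized = (score - lo) / (hi - lo)       # same formula as apply_color_gradient
    print(score, rgb_to_hex(cmap(normalized)))  # red at the bottom, green at the top
```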
evaluation_service.py ADDED
@@ -0,0 +1,190 @@
+ #!/usr/bin/env python3
+ """
+ Evaluation Service module for MTEB Turkish Leaderboard
+ Handles evaluation submissions and status tracking
+ """
+
+ import time
+ import re
+ from typing import Optional, Tuple, List
+ import traceback
+ import pandas as pd
+ import gradio as gr
+
+ from api_client import send_evaluation_request_to_api, get_evaluation_status, cancel_evaluation_request
+
+ # Global state management for active evaluations
+ active_evaluations = {}  # request_id -> {"status": str, "model_name": str, "email": str, "start_time": float}
+
+
+ def get_active_evaluations_status() -> str:
+     """Show the status of active evaluations"""
+     if not active_evaluations:
+         return "🟢 No active evaluation requests"
+
+     status_lines = []
+     for request_id, info in active_evaluations.items():
+         model_name = info["model_name"]
+         email = info["email"]
+         elapsed = int(time.time() - info["start_time"])
+         status = info.get("status", "PENDING")
+         status_lines.append(f"🔄 {model_name} ({email}) - {request_id} [{status}] ({elapsed}s)")
+
+     return "\n".join(status_lines)
+
+
+ def get_active_evaluations_with_cancel_options() -> Tuple[str, List[str]]:
+     """Get active evaluations status and cancellation options"""
+     status_text = get_active_evaluations_status()
+
+     cancel_options = []
+     for request_id, info in active_evaluations.items():
+         model_name = info["model_name"]
+         cancel_options.append(f"{request_id} - {model_name}")
+
+     return status_text, cancel_options
+
+
+ def clear_active_evaluations() -> str:
+     """Clear all active evaluations from tracking"""
+     global active_evaluations
+     count = len(active_evaluations)
+     active_evaluations.clear()
+     return f"✅ Cleared {count} active evaluation(s) from tracking"
+
+
+ def cancel_active_evaluation(selection: str) -> str:
+     """Cancel a selected active evaluation"""
+     if not selection:
+         return "❌ No evaluation selected for cancellation"
+
+     try:
+         request_id = selection.split(" - ")[0]
+
+         if request_id not in active_evaluations:
+             return f"❌ Evaluation {request_id} not found in active evaluations"
+
+         # Try to cancel via the API
+         success = cancel_evaluation_request(request_id)
+
+         if success:
+             model_name = active_evaluations[request_id]["model_name"]
+             del active_evaluations[request_id]
+             return f"✅ Successfully cancelled evaluation for {model_name} (ID: {request_id})"
+         else:
+             return f"❌ Failed to cancel evaluation {request_id}. Check the API connection."
+
+     except Exception as e:
+         return f"❌ Error cancelling evaluation: {str(e)}"
+
+
+ def _validate_evaluation_request(model_name: str, email: Optional[str] = None) -> Optional[str]:
+     """Validate evaluation request parameters; return an error message, or None if valid"""
+     # Model name validation
+     if not model_name or not model_name.strip():
+         return "❌ Model name cannot be empty!"
+
+     model_name = model_name.strip()
+
+     # Check model name length (format: org/model-name)
+     if len(model_name) < 3:
+         return "❌ Model name too short!"
+
+     if len(model_name) > 256:
+         return "❌ Model name too long (maximum 256 characters)!"
+
+     # Check for a valid HuggingFace model name format (must be org/model)
+     if '/' not in model_name:
+         return "❌ Invalid model name format! Must include the organization (e.g., organization/model-name)"
+
+     if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
+         return "❌ Invalid model name format! Use the format: organization/model-name"
+
+     # Email validation
+     if not email or not email.strip():
+         return "❌ Email address cannot be empty!"
+
+     email = email.strip()
+
+     if len(email) > 254:
+         return "❌ Email address too long!"
+
+     email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
+     if not re.match(email_pattern, email):
+         return "❌ Invalid email address format!"
+
+     return None
+
+
+ def submit_evaluation(model_name: str, email: str, batch_size: int, current_data: pd.DataFrame, progress=gr.Progress()) -> Tuple[str, Optional[pd.DataFrame]]:
+     """Validate and submit an evaluation request; return (message, leaderboard data)"""
+     try:
+         # Input validation
+         error_msg = _validate_evaluation_request(model_name, email)
+         if error_msg:
+             return error_msg, None
+
+         # Show progress
+         progress(0.1, desc="Sending evaluation request to API...")
+
+         # Send the request to the API. Regardless of the backend response, show success
+         # to the user: backend errors (such as duplicate requests) are handled by the
+         # API and communicated via email.
+         send_evaluation_request_to_api(model_name, batch_size, email)
+
+         progress(1.0, desc="Request submitted successfully!")
+
+         success_msg = f"""
+ ✅ Evaluation request submitted successfully!
+
+ 🤖 Model: {model_name}
+ 📧 Email: {email}
+
+ 📋 Next Steps:
+ ⏱️ Your request will be reviewed by our system
+ 📧 You will receive email notifications about the status of your evaluation
+ 🔄 If you've submitted this model before, you'll be notified via email
+
+ Thank you for contributing to the Mizan Leaderboard!
+ """
+
+         return success_msg.strip(), current_data
+
+     except Exception as e:
+         # Log the error for debugging
+         print(f"❌ Error submitting evaluation: {str(e)}")
+         traceback.print_exc()
+
+         error_msg = f"""
+ ❌ Failed to submit evaluation request
+
+ 🤖 Model: {model_name}
+ 📧 Email: {email}
+
+ ⚠️ Error: Unable to connect to the evaluation service.
+
+ Please try again later or contact support if the problem persists.
+ """
+         return error_msg.strip(), None
+
+
+ def refresh_evaluation_status() -> str:
+     """Refresh the status of all active evaluations"""
+     if not active_evaluations:
+         return "🟢 No active evaluations to refresh"
+
+     updated_count = 0
+     for request_id, info in active_evaluations.items():
+         try:
+             status_data = get_evaluation_status(request_id)
+             if status_data and "status" in status_data:
+                 old_status = info.get("status", "UNKNOWN")
+                 new_status = status_data["status"]
+                 if old_status != new_status:
+                     info["status"] = new_status
+                     updated_count += 1
+                     print(f"Status updated for {request_id}: {old_status} -> {new_status}")
+         except Exception as e:
+             print(f"Error refreshing status for {request_id}: {e}")
+
+     return f"🔄 Refreshed status for {len(active_evaluations)} evaluation(s). {updated_count} status change(s) detected."
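
A few illustrative calls to the validator; the expected outcomes follow directly from the rules in `_validate_evaluation_request` above (it is a private helper, imported here only for demonstration):

```python
from evaluation_service import _validate_evaluation_request

# Well-formed org/model name and email -> None (no error)
print(_validate_evaluation_request("sentence-transformers/all-MiniLM-L6-v2", "user@example.com"))

# Missing organization prefix -> format error message
print(_validate_evaluation_request("my-model", "user@example.com"))

# Malformed email -> email format error message
print(_validate_evaluation_request("org/my-model", "not-an-email"))
```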
leaderboard_data.csv ADDED
@@ -0,0 +1,33 @@
+ Model,Number of Parameters,Embedding Dim,Max Seq Length,Mean (Task),Mean (TaskType),Classification,Clustering,Pair Classification,Retrieval,STS,Correlation,Vocab Size
+ BAAI/bge-m3,567M,1024,8192,69.39,63.51,75.68,35.26,78.88,57.89,69.83,0.61,250002
+ intfloat/multilingual-e5-large,559M,1024,512,66.61,62.08,71.8,41.2,72.76,57.17,67.49,0.58,250002
+ newmindai/TurkEmbed4STS,305M,768,8192,65.66,62.03,69.69,44.29,81.77,47.6,66.79,0.68,250048
+ ytu-ce-cosmos/turkish-e5-large,559M,1024,512,64.93,59.73,72.42,38.51,70.86,47.6,69.24,0.56,250002
+ intfloat/multilingual-e5-large-instruct,559M,1024,512,64.33,58.57,72.25,33.16,72.92,44.95,69.56,0.57,250002
+ nomic-ai/nomic-embed-text-v2-moe,475M,768,512,64.28,60.15,70.07,41.28,63.87,56.4,69.16,0.53,250048
+ Alibaba-NLP/gte-multilingual-base,305M,768,32768,63.86,60.04,68.0,39.16,76.0,50.12,66.94,0.62,250048
+ sentence-transformers/paraphrase-multilingual-mpnet-base-v2,278M,768,512,63.33,57.63,70.88,41.35,83.6,33.81,58.51,0.65,250002
+ newmindai/modernbert-base-tr-uncased-allnli-stsb,134M,768,8192,61.29,54.09,71.47,35.46,82.83,24.81,55.89,0.66,32000
+ numind/NuSentiment-multilingual,278M,768,512,60.52,49.65,73.67,14.96,76.89,32.76,49.96,0.52,250002
+ newmindai/TurkEmbed4Retrieval,305M,768,512,60.5,58.04,64.78,47.47,64.04,47.82,66.1,0.57,250048
+ Qwen/Qwen3-Embedding-0.6B,595M,1024,131072,60.18,56.53,64.68,33.36,66.02,50.06,68.55,0.48,151669
+ sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,117M,384,512,59.95,54.8,67.21,42.31,79.3,29.95,55.24,0.6,250037
+ newmindai/TurkEmbed4STS-HD,305M,768,8192,59.94,53.06,67.61,34.24,80.08,35.88,47.47,0.65,250048
+ emrecan/bert-base-turkish-cased-mean-nli-stsb-tr,110M,768,512,59.92,52.65,68.38,24.61,74.94,39.0,56.3,0.62,32000
+ ibm-granite/granite-embedding-278m-multilingual,278M,768,512,55.9,54.48,58.64,41.98,60.13,45.08,66.57,0.41,250002
+ newmindai/ModernBERT-tr-uncased-stsb-HD,134M,768,8192,54.51,43.94,67.17,17.96,82.51,16.08,35.98,0.61,32000
+ ibm-granite/granite-embedding-107m-multilingual,106M,384,512,52.68,50.72,55.75,34.17,59.86,39.97,63.85,0.38,250002
+ minishlab/potion-multilingual-128M,128M,256,N/A,50.39,44.47,58.34,23.47,59.76,30.84,49.93,0.43,500358
+ google/embeddinggemma-300m,307M,768,2048,49.08,44.98,55.23,22.84,61.02,26.92,58.91,0.27,262144
+ nomic-ai/nomic-embed-text-v1,136M,768,8192,45.12,41.46,48.3,9.45,59.75,32.9,56.88,0.42,30528
+ nomic-ai/nomic-embed-text-v1.5,136M,768,8192,44.63,40.04,48.92,9.69,58.53,32.19,50.89,0.41,30528
+ mixedbread-ai/mxbai-embed-large-v1,335M,1024,512,44.0,39.23,49.49,15.99,56.66,27.75,46.25,0.37,30522
+ sentence-transformers/multi-qa-MiniLM-L6-cos-v1,22M,384,512,38.82,32.39,44.08,5.55,58.29,25.16,28.88,0.34,30522
+ boun-tabi-LMG/TURNA,495M,1024,1024,38.36,30.96,47.17,10.26,56.62,13.04,27.73,0.22,32128
+ sentence-transformers/all-MiniLM-L12-v2,33M,384,512,38.28,31.13,44.77,7.82,58.2,21.64,23.24,0.36,30522
+ nielsr/lilt-xlm-roberta-base,284M,768,512,38.01,29.57,50.1,12.79,55.35,2.45,27.14,0.22,250002
+ sentence-transformers/all-MiniLM-L6-v2,22M,384,512,37.95,31.97,44.46,6.58,56.75,16.48,35.55,0.31,30522
+ sentence-transformers/all-mpnet-base-v2,109M,768,512,37.21,31.31,43.75,10.56,55.99,15.16,31.08,0.31,30527
+ minishlab/potion-base-8M,7M,256,N/A,36.85,30.01,42.51,2.26,57.86,21.75,25.64,0.36,29528
+ sentence-transformers/paraphrase-MiniLM-L6-v2,22M,384,512,36.26,28.19,44.02,4.53,56.62,17.47,18.29,0.33,30522
+ newmindai/lettucedect-210m-eurobert-tr-v1,211M,768,8192,27.66,21.55,34.32,1.54,52.34,0.22,19.34,0.1,128256
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio>=5.49.1
+ pandas>=2.3.3
+ numpy>=2.3.4
+ matplotlib>=3.10.7
+ requests>=2.32.5
+ python-dotenv>=1.1.1
+ itsdangerous>=2.2.0
ui_components.py ADDED
@@ -0,0 +1,259 @@
+ #!/usr/bin/env python3
+ """
+ UI Components module for MTEB Turkish Leaderboard - HF Spaces Version
+ Simplified version with only leaderboard and dataset components
+ """
+
+ import gradio as gr
+ import pandas as pd
+ from data_processor import (create_styled_leaderboard_dataframe,
+                             create_empty_leaderboard_dataframe)
+
+
+ def create_leaderboard_tab(current_data: pd.DataFrame):
+     """Create the main leaderboard tab with color styling"""
+
+     # Handle empty or invalid data
+     if current_data.empty or "Model" not in current_data.columns:
+         print("⚠️ Warning: Empty or invalid data, using empty leaderboard structure")
+         current_data = create_empty_leaderboard_dataframe()
+
+     # Apply color styling to score columns using a pandas Styler
+     styled_data = create_styled_leaderboard_dataframe(current_data)
+
+     leaderboard = gr.Dataframe(
+         value=styled_data,
+         interactive=False,
+         wrap=True,
+         max_height=600,
+         show_search=True,
+         datatype=["number", "html", "number", "number", "number", "number", "number", "number", "number", "number", "str", "number", "str", "number"],  # Model column as HTML for clickable links
+         column_widths=["70px", "250px", "130px", "130px", "160px", "130px", "170px", "130px", "100px", "130px", "120px", "120px", "120px", "120px"]
+     )
+
+     # Information about the leaderboard
+     gr.Markdown("""
+     ### 🔍 How to Use the Leaderboard:
+     - **Search**: Use the search box to find specific models
+     - **Color Coding**: Scores are color-coded from red (low) to green (high)
+     - **Sorting**: Click on column headers to sort by different metrics
+     - **Rankings**: Models are ranked by Mean (Task) score
+
+     ### 📊 Performance Insights:
+     - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
+     - **Specialized Models**: Some models excel at specific tasks (e.g., retrieval vs. classification)
+     - **Model Size vs. Performance**: Larger models generally perform better, with exceptions
+     """)
+
+     return leaderboard
+
+
+ def create_dataset_tab():
+     """Create the dataset information tab"""
+
+     gr.Markdown("### 📊 MTEB Turkish Dataset Overview")
+
+     # Task name to dataset path mapping
+     task_to_dataset = {
+         'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
+         'XQuADRetrieval': 'google/xquad',
+         'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
+         'MKQARetrieval': 'apple/mkqa',
+         'MassiveIntentClassification': 'mteb/amazon_massive_intent',
+         'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
+         'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
+         'SIB200Classification': 'mteb/sib200',
+         'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
+         'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
+         'SIB200ClusteringS2S': 'mteb/sib200',
+         'XNLI': 'mteb/xnli',
+         'XNLIV2': 'mteb/xnli2.0-multi-pair',
+         'STS22.v2': 'mteb/sts22-crosslingual-sts'
+     }
+
+     # Create clickable task names
+     clickable_task_names = []
+     for task_name in [
+         'WebFAQRetrieval', 'XQuADRetrieval', 'TurHistQuadRetrieval', 'MKQARetrieval',
+         'MassiveIntentClassification', 'MassiveScenarioClassification',
+         'MultilingualSentimentClassification', 'SIB200Classification',
+         'TurkishMovieSentimentClassification', 'TurkishProductSentimentClassification',
+         'SIB200ClusteringS2S', 'XNLI', 'XNLIV2', 'STS22.v2'
+     ]:
+         dataset_path = task_to_dataset[task_name]
+         hf_link = f"https://huggingface.co/datasets/{dataset_path}"
+         clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
+         clickable_task_names.append(clickable_name)
+
+     # Create the dataset information table
+     dataset_data = pd.DataFrame({
+         'Task Name': clickable_task_names,
+         'Task Type': [
+             'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
+             'Classification', 'Classification',
+             'Classification', 'Classification',
+             'Classification', 'Classification',
+             'Clustering', 'PairClassification', 'PairClassification', 'STS'
+         ],
+         'Description': [
+             'Turkish FAQ retrieval task',
+             'Turkish question answering retrieval',
+             'Historical Turkish document retrieval',
+             'Multilingual knowledge QA retrieval',
+             'Intent classification for Turkish',
+             'Scenario classification for Turkish',
+             'Multilingual sentiment classification',
+             'SIB200 language identification',
+             'Turkish movie review sentiment',
+             'Turkish product review sentiment',
+             'SIB200 clustering task',
+             'Turkish natural language inference',
+             'Enhanced Turkish NLI task',
+             'Turkish semantic textual similarity'
+         ],
+         'Domain': [
+             'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
+             'Intent', 'Scenario',
+             'Sentiment', 'Language ID',
+             'Movies', 'Products',
+             'Language ID', 'NLI', 'NLI', 'STS'
+         ],
+         'Samples': [
+             '~135K', '~10K', '~1.4K', '~10K',
+             '~11K', '~11K',
+             '~4.5K', '~700',
+             '~8K', '~4.8K',
+             '~1K', '~1.4K', '~1.4K', '~400'
+         ]
+     })
+
+     dataset_table = gr.Dataframe(
+         value=dataset_data,
+         label="MTEB Turkish Task Details",
+         interactive=False,
+         wrap=True,
+         datatype=["html", "str", "str", "str", "str"]  # Task Name column as HTML for clickable links
+     )
+
+     # Task type distribution
+     gr.Markdown("""
+     ### 📈 Task Distribution:
+
+     **By Task Type:**
+     - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
+     - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
+     - **Pair Classification**: 2 tasks (natural language inference)
+     - **Clustering**: 1 task (language clustering)
+     - **STS**: 1 task (semantic textual similarity)
+
+     **By Domain:**
+     - **Sentiment Analysis**: Movie and product reviews
+     - **Question Answering**: FAQ, reading comprehension, and knowledge QA
+     - **Intent/Scenario**: Conversational AI applications
+     - **Language Tasks**: NLI, STS, clustering
+     - **Multilingual**: Cross-lingual evaluation capabilities
+     """)
+
+     # Statistics summary
+     stats_data = pd.DataFrame({
+         'Metric': [
+             'Total Tasks',
+             'Total Samples',
+             'Task Types',
+             'Languages',
+             'Avg. Tokens per Sample'
+         ],
+         'Value': [
+             '14 tasks',
+             '~190K samples',
+             '5 types',
+             'Turkish + Multilingual',
+             '~150 tokens'
+         ],
+         'Notes': [
+             'Comprehensive evaluation across domains',
+             'Large-scale evaluation dataset',
+             'Classification, Retrieval, STS, NLI, Clustering',
+             'Focus on Turkish with multilingual support',
+             'Varies by task type and domain'
+         ]
+     })
+
+     gr.Dataframe(
+         value=stats_data,
+         label="Dataset Statistics Summary",
+         interactive=False
+     )
+
+     gr.Markdown("""
+     ### 🎯 Evaluation Methodology:
+
+     **Scoring:**
+     - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
+     - **Mean (Task)**: Direct average of all individual task scores
+     - **Mean (TaskType)**: Average of the task category means
+     - **Individual Categories**: Performance in each task type
+
+     **Model Ranking:**
+     - Primary ranking by **Mean (Task)** score
+     - Correlation metrics provide additional insight
+     - Task-specific performance shows model strengths
+
+     **Quality Assurance:**
+     - Standardized evaluation protocols
+     - Consistent preprocessing across tasks
+     - Multiple metrics per task for robustness
+     """)
+
+     return dataset_table
+
+
+ def create_submit_evaluation_tab():
+     """Create the submit evaluation tab with the submission form"""
+
+     gr.Markdown("### 🚀 Submit Model for Evaluation")
+     gr.Markdown("""
+     Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
+     **Authentication with Hugging Face is required to submit evaluations.**
+     """)
+
+     # OAuth login button
+     login_button = gr.LoginButton(value="Sign in with Hugging Face")
+
+     model_input = gr.Textbox(
+         label="🤖 Model Name",
+         placeholder="sentence-transformers/your-model",
+         info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
+     )
+
+     email_input = gr.Textbox(
+         label="📧 Email Address",
+         placeholder="your.email@example.com",
+         info="Email for notifications about evaluation status and results"
+     )
+
+     submit_btn = gr.Button(
+         "🚀 Submit",
+         variant="primary",
+         size="lg"
+     )
+
+     # Result output for authentication messages
+     result_output = gr.HTML(label="Status")
+
+     # Information about the evaluation process
+     gr.Markdown("""
+     ### 📋 Evaluation Process:
+     1. **Sign In**: First, sign in with your Hugging Face account using the button above
+     2. **Submit Request**: Fill out the form with your model details and email
+     3. **Admin Review**: Your request will be reviewed by administrators
+     4. **Evaluation**: If approved, your model will be evaluated on the MTEB Turkish benchmark
+     5. **Results**: You'll receive email notifications, and results will appear on the leaderboard
+
+     ### ⚠️ Important Notes:
+     - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
+     - You'll receive email updates about your request status
+     - Make sure your model is publicly available on HuggingFace
+     - A valid email address is required for receiving results
+     """)
+
+     return (model_input, email_input, submit_btn, login_button, result_output)