File size: 10,129 Bytes
9a235dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
#!/usr/bin/env python3
"""
UI Components module for MTEB Turkish Leaderboard - HF Spaces Version
Simplified version with only leaderboard and dataset components
"""
import gradio as gr
import pandas as pd
from data_processor import (create_styled_leaderboard_dataframe,
create_empty_leaderboard_dataframe)
def create_leaderboard_tab(current_data: pd.DataFrame):
    """Build the main leaderboard tab with color-styled score columns.

    Returns:
        gr.Dataframe: the leaderboard table component.
    """
    # Guard: fall back to an empty-but-well-formed table when the incoming
    # data is missing or lacks the expected "Model" column.
    if current_data.empty or "Model" not in current_data.columns:
        print("β οΈ Warning: Empty or invalid data, using empty leaderboard structure")
        current_data = create_empty_leaderboard_dataframe()

    # Pandas Styler paints score cells on a red-to-green gradient.
    styled_board = create_styled_leaderboard_dataframe(current_data)

    # Column 2 (Model) is rendered as HTML so model names can be clickable links;
    # the remaining columns are numeric scores plus two string columns.
    column_types = ["number", "html"] + ["number"] * 8 + ["str", "number", "str", "number"]
    column_widths = [
        "70px", "250px", "130px", "130px", "160px", "130px", "170px",
        "130px", "100px", "130px", "120px", "120px", "120px", "120px",
    ]

    leaderboard_table = gr.Dataframe(
        value=styled_board,
        interactive=False,
        wrap=True,
        max_height=600,
        show_search=True,
        datatype=column_types,
        column_widths=column_widths,
    )

    # Usage notes rendered beneath the table.
    gr.Markdown("""
    ### π How to Use the Leaderboard:
    - **Search**: Use the search box to find specific models
    - **Color Coding**: Scores are color-coded from red (low) to green (high)
    - **Sorting**: Click on column headers to sort by different metrics
    - **Rankings**: Models are ranked by Mean (Task) score
    ### π Performance Insights:
    - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
    - **Specialized Models**: Some models excel in specific tasks (e.g., retrieval vs classification)
    - **Model Size vs Performance**: Larger models generally perform better but with exceptions
    """)
    return leaderboard_table
def create_dataset_tab():
    """Create the dataset information tab.

    Renders a table of MTEB Turkish tasks (task names link to their Hugging
    Face dataset pages), a task-distribution summary, overall dataset
    statistics, and a description of the evaluation methodology.

    Returns:
        gr.Dataframe: the task-details table component.
    """
    gr.Markdown("### π MTEB Turkish Dataset Overview")
    # Task name -> HF dataset path. Insertion order is the row order of the
    # table below, so keep it aligned with the Task Type / Description /
    # Domain / Samples lists further down.
    task_to_dataset = {
        'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
        'XQuADRetrieval': 'google/xquad',
        'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
        'MKQARetrieval': 'apple/mkqa',
        'MassiveIntentClassification': 'mteb/amazon_massive_intent',
        'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
        'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
        'SIB200Classification': 'mteb/sib200',
        'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
        'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
        'SIB200ClusteringS2S': 'mteb/sib200',
        'XNLI': 'mteb/xnli',
        'XNLIV2': 'mteb/xnli2.0-multi-pair',
        'STS22.v2': 'mteb/sts22-crosslingual-sts'
    }
    # Build clickable task names straight from the mapping — a single source
    # of truth. (Previously a second, hard-coded copy of the task list had to
    # be kept manually in sync with the dict keys.)
    clickable_task_names = [
        f'<a href="https://huggingface.co/datasets/{dataset_path}" target="_blank" '
        f'style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
        for task_name, dataset_path in task_to_dataset.items()
    ]
    # Per-task metadata; each list is positionally aligned with the dict above.
    dataset_data = pd.DataFrame({
        'Task Name': clickable_task_names,
        'Task Type': [
            'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Clustering', 'PairClassification', 'PairClassification', 'STS'
        ],
        'Description': [
            'Turkish FAQ retrieval task',
            'Turkish question answering retrieval',
            'Historical Turkish document retrieval',
            'Multilingual knowledge QA retrieval',
            'Intent classification for Turkish',
            'Scenario classification for Turkish',
            'Multilingual sentiment classification',
            'SIB200 language identification',
            'Turkish movie review sentiment',
            'Turkish product review sentiment',
            'SIB200 clustering task',
            'Turkish natural language inference',
            'Enhanced Turkish NLI task',
            'Turkish semantic textual similarity'
        ],
        'Domain': [
            'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
            'Intent', 'Scenario',
            'Sentiment', 'Language ID',
            'Movies', 'Products',
            'Language ID', 'NLI', 'NLI', 'STS'
        ],
        'Samples': [
            '~135K', '~10K', '~1.4K', '~10K',
            '~11K', '~11K',
            '~4.5K', '~700',
            '~8K', '~4.8K',
            '~1K', '~1.4K', '~1.4K', '~400'
        ]
    })
    dataset_table = gr.Dataframe(
        value=dataset_data,
        label="MTEB Turkish Task Details",
        interactive=False,
        wrap=True,
        datatype=["html", "str", "str", "str", "str"]  # First column (Task Name) as HTML for clickable links
    )
    # Task type distribution summary.
    gr.Markdown("""
    ### π Task Distribution:
    **By Task Type:**
    - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
    - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
    - **Pair Classification**: 2 tasks (natural language inference)
    - **Clustering**: 1 task (language clustering)
    - **STS**: 1 task (semantic textual similarity)
    **By Domain:**
    - **Sentiment Analysis**: Movie and product reviews
    - **Question Answering**: FAQ, reading comprehension, and knowledge QA
    - **Intent/Scenario**: Conversational AI applications
    - **Language Tasks**: NLI, STS, clustering
    - **Multilingual**: Cross-lingual evaluation capabilities
    """)
    # High-level dataset statistics table.
    stats_data = pd.DataFrame({
        'Metric': [
            'Total Tasks',
            'Total Samples',
            'Task Types',
            'Languages',
            'Avg. Tokens per Sample'
        ],
        'Value': [
            '14 tasks',
            '~190K samples',
            '5 types',
            'Turkish + Multilingual',
            '~150 tokens'
        ],
        'Notes': [
            'Comprehensive evaluation across domains',
            'Large-scale evaluation dataset',
            'Classification, Retrieval, STS, NLI, Clustering',
            'Focus on Turkish with multilingual support',
            'Varies by task type and domain'
        ]
    })
    gr.Dataframe(
        value=stats_data,
        label="Dataset Statistics Summary",
        interactive=False
    )
    gr.Markdown("""
    ### π― Evaluation Methodology:
    **Scoring:**
    - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
    - **Mean (Task)**: Direct average of all individual task scores
    - **Mean (TaskType)**: Average of task category means
    - **Individual Categories**: Performance in each task type
    **Model Ranking:**
    - Primary ranking by **Mean (Task)** score
    - Correlation metrics provide additional insights
    - Task-specific performance shows model strengths
    **Quality Assurance:**
    - Standardized evaluation protocols
    - Consistent preprocessing across tasks
    - Multiple metrics per task for robustness
    """)
    return dataset_table
def create_submit_evaluation_tab():
    """Build the submit-evaluation tab: login button, request form, and status area.

    Returns:
        tuple: (model textbox, email textbox, submit button, login button, status HTML).
    """
    gr.Markdown("### π Submit Model for Evaluation")
    gr.Markdown("""
    Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
    **Authentication with Hugging Face is required to submit evaluations.**
    """)

    # Hugging Face OAuth sign-in; submissions require authentication.
    auth_button = gr.LoginButton(value="Sign in with Hugging Face")

    # Submission form fields.
    model_field = gr.Textbox(
        label="π€ Model Name",
        placeholder="sentence-transformers/your-model",
        info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)",
    )
    email_field = gr.Textbox(
        label="π§ Email Address",
        placeholder="your.email@example.com",
        info="Email for notifications about evaluation status and results",
    )
    submit_button = gr.Button("π Submit", variant="primary", size="lg")

    # Authentication / submission status messages are rendered here.
    status_area = gr.HTML(label="Status")

    # Walk the user through the submission workflow.
    gr.Markdown("""
    ### π Evaluation Process:
    1. **Sign In**: First, sign in with your Hugging Face account using the button above
    2. **Submit Request**: Fill out the form with your model details and email
    3. **Admin Review**: Your request will be reviewed by administrators
    4. **Evaluation**: If approved, your model will be evaluated on MTEB Turkish benchmark
    5. **Results**: You'll receive email notifications and results will appear on the leaderboard
    ### β οΈ Important Notes:
    - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
    - You'll receive email updates about your request status
    - Make sure your model is publicly available on HuggingFace
    - Valid email address is required for receiving results
    """)
    return (model_field, email_field, submit_button, auth_button, status_area)
|