Spaces:

newmindai
/

Mezura

Running

File size: 57,613 Bytes

import pandas as pd
import json
import os
import glob
import gradio as gr
import traceback
import re
import plotly.express as px
import plotly.graph_objects as go
from src.envs import API, TOKEN, REPO_ID
import requests
import logging
from datetime import datetime
from dotenv import load_dotenv
from utils.rag_score_calculator import RAGScoreCalculator

# General-purpose logger for this utilities module
logger = logging.getLogger("mezura.utils")

# Dedicated logger that records model submissions in "submissions.log".
# logging.getLogger() returns the same logger object on every import/reload of
# this module, so the file handler is attached only once; otherwise each
# reload would add another handler and every submission would be written
# multiple times.
submission_logger = logging.getLogger("mezura.submissions")
submission_formatter = logging.Formatter('%(asctime)s - %(message)s')
submission_handler = next(
    (
        handler
        for handler in submission_logger.handlers
        if isinstance(handler, logging.FileHandler)
        and os.path.basename(handler.baseFilename) == "submissions.log"
    ),
    None,
)
if submission_handler is None:
    submission_handler = logging.FileHandler("submissions.log")
    submission_handler.setFormatter(submission_formatter)
    submission_logger.addHandler(submission_handler)
submission_logger.setLevel(logging.INFO)

# Model metadata lookup table - centralized for all benchmark functions.
# Keys are the raw model names as they appear in the result files; several
# models are listed under more than one spelling (casing / legacy aliases).
# Spelling variants of the same model must carry identical metadata so the
# leaderboard shows the same license regardless of which spelling a result
# file used.
MODEL_METADATA_LOOKUP = {
    "mistralai/Magistral-Small-2506": {"license": "Apache 2.0", "dtype": "bfloat16"},
    "newmindai/Qwen2.5-72B-Instruct": {"license": "Qwen", "dtype": "bfloat16"},
    "Qwen/Qwen2.5-72B-Instruct": {"license": "Qwen", "dtype": "bfloat16"},
    "deepseek-ai/DeepSeek-R1": {"license": "MIT", "dtype": "bfloat16"},
    # NOTE(review): Qwen3-14B below is listed as "Apache 2.0" while Qwen3-32B
    # is listed as "Qwen" - confirm which license value is intended.
    "Qwen/Qwen3-32B": {"license": "Qwen", "dtype": "bfloat16"},
    "newmindai/QwQ-32B-r1": {"license": "Apache 2.0", "dtype": "bfloat16"},
    "google/gemma-3-27b-it": {"license": "Gemma", "dtype": "bfloat16"},
    "Qwen/Qwen3-14B": {"license": "Apache 2.0", "dtype": "bfloat16"},
    "newmindai/Llama-3.3-70b-Instruct": {"license": "Llama-3.3", "dtype": "bfloat16"},
    "Qwen/QwQ-32B": {"license": "Apache 2.0", "dtype": "bfloat16"},
    "microsoft/phi-4": {"license": "MIT", "dtype": "bfloat16"},
    "meta-llama/Meta-Llama-3.1-70B-Instruct": {"license": "Llama 3.1", "dtype": "bfloat16"},
    "grok-3": {"license": "Proprietary", "dtype": "Unknown"},
    "grok-3-mini-fast": {"license": "Proprietary", "dtype": "Unknown"},
    "meta-llama/Llama-3.3-70B-Instruct": {"license": "Llama-3.3", "dtype": "bfloat16"},
    # lowercase b variant - same license string as the uppercase-B entry above
    "meta-llama/Llama-3.3-70b-Instruct": {"license": "Llama-3.3", "dtype": "bfloat16"},
    "newmindai/Qwen2.5-72b-Instruct": {"license": "Qwen", "dtype": "bfloat16"},  # lowercase b variant
    "grok-3-mini-fast-beta": {"license": "Proprietary", "dtype": "Unknown"},  # beta variant
    # Legacy entries for backward compatibility
    "deepseek-r1-distill-llama-70b": {"license": "MIT", "dtype": "bfloat16"},
    "qwen-qwq-32b": {"license": "Apache 2.0", "dtype": "bfloat16"}
}

def log_model_submission(repo_id, base_model):
    """
    Record one model submission in the dedicated submissions log.

    Args:
        repo_id: Repository ID of the submitted model
        base_model: Base model the submission was built on
    """
    entry = f"SUBMISSION - REPO_ID: {repo_id}, BASE_MODEL: {base_model}"
    submission_logger.info(entry)

def restart_space():
    """
    Ask the API client to restart the Space identified by REPO_ID.

    Best-effort: a missing API client or any error raised by the restart call
    is reported with a printed warning instead of being propagated.
    """
    try:
        if API is None:
            print("Warning: API is None, cannot restart space")
            return
        API.restart_space(repo_id=REPO_ID, token=TOKEN)
    except Exception as exc:
        print(f"Warning: Could not restart space: {exc}")


def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Return a view of ``df`` restricted to the columns to display.

    The model-type-symbol and model columns always come first, followed by
    every requested column that exists in ``df`` (in the order requested), and
    finally the dummy column used for searching.
    """
    leading = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
    requested = [name for name in columns if name in df.columns]
    trailing = [AutoEvalColumn.dummy.name]
    return df[leading + requested + trailing]


def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter the leaderboard by a free-text search query.

    The query is split on ``;``; a row is kept when the search (dummy) column
    contains at least one of the resulting terms, case-insensitively.

    Args:
        query: Search text typed by the user; may hold several terms
            separated by ``;``. Empty/blank queries return the input as is.
        filtered_df: DataFrame to search.

    Returns:
        pd.DataFrame: Rows matching any term, with duplicate rows removed.
    """
    if not query:
        return filtered_df

    terms = [term for term in (part.strip() for part in query.split(";")) if term]
    if not terms:
        return filtered_df

    search_column = filtered_df[AutoEvalColumn.dummy.name]
    matches = []
    for term in terms:
        # na=False: rows with a missing search value simply do not match
        # (a NaN in the mask would make the boolean indexing below raise).
        try:
            mask = search_column.str.contains(term, case=False, na=False)
        except re.error:
            # The term is user input and may not be a valid regular expression
            # (e.g. "c++" or "("); fall back to a literal substring search
            # instead of crashing the UI callback.
            mask = search_column.str.contains(term, case=False, na=False, regex=False)
        matches.append(filtered_df[mask])

    # Combine the per-term results; a row matching several terms appears once
    return pd.concat(matches).drop_duplicates()


def filter_models(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Return a copy of ``df`` without the models flagged as no longer on the hub.

    No other filtering is applied: every row whose still-on-hub column is
    truthy is kept.
    """
    snapshot = df.copy()
    on_hub_mask = snapshot[AutoEvalColumn.still_on_hub.name]
    return snapshot[on_hub_mask]


# New functions
def _read_benchmark_file(file, prefix, benchmark_type):
    """Read one result JSON file and normalise it to a dict with "model_name" and "file"."""
    base_name = os.path.basename(file)
    # Fallback display name derived from the file name, e.g. "avg_123.json" -> "Model 123"
    fallback_name = f"Model {base_name.replace(prefix, '').split('.')[0]}"

    # JSON result files are read as UTF-8 regardless of the platform locale
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        if benchmark_type == "arena" and len(data) > 0:
            # Arena files may hold a list of dicts: flatten every entry into
            # "item_<index>_<key>" fields of a single record
            flattened = {"model_name": fallback_name, "file": base_name}
            for i, item in enumerate(data):
                if isinstance(item, dict):
                    for key, value in item.items():
                        flattened[f"item_{i}_{key}"] = value
            data = flattened
        else:
            # Lists of any other benchmark type carry no usable fields
            data = {"model_name": fallback_name}
    elif not isinstance(data, dict):
        data = {"model_name": fallback_name}

    data["file"] = base_name
    if not data.get("model_name"):
        data["model_name"] = fallback_name
    data["model_name"] = format_model_name(data["model_name"])
    return data


def _attach_cached_rag_score(data, rag_scores_cache, source):
    """Copy the pre-calculated RAG score for data's run_id into data["RAG_score"], if cached."""
    run_id = data.get('run_id')
    if run_id and run_id in rag_scores_cache:
        data["RAG_score"] = rag_scores_cache[run_id]
        logger.debug(f"Added cached RAG_score {rag_scores_cache[run_id]} for {source} file {data.get('model_name', 'unknown')}")
    else:
        run_id_label = "run_id" if source == "avg" else f"{source} run_id"
        logger.debug(f"No cached RAG_score found for {run_id_label}: {run_id}")


def load_benchmark_results():
    """
    Load benchmark results from the local ``result/<benchmark_type>`` folders.

    For every benchmark type, ``avg_*.json`` files are loaded into
    ``results["avg"]`` and ``detail_*.json`` files into ``results["raw"]``.
    Each loaded record is a dict that always has a formatted "model_name" and
    the source "file" name. Retrieval records additionally get a "RAG_score"
    when one was pre-calculated for their "run_id". A model that only has a
    detail file also gets a simplified entry in ``results["avg"]`` so it still
    shows up in the leaderboard.

    Files are processed in sorted order so the result (including which entry
    wins the de-duplication by model name) is deterministic. A file that
    cannot be loaded is reported with a printed message and skipped.

    Returns:
        dict: ``{"avg": {benchmark_type: [record, ...]}, "raw": {...}}``
    """
    # Benchmark types to look for ("lm_harness" removed)
    benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena"]

    results = {
        "avg": {name: [] for name in benchmark_types},
        "raw": {name: [] for name in benchmark_types},
    }

    # Pre-calculate RAG scores (keyed by run_id) once, before reading any file.
    # Failure here is not fatal: results are then loaded without RAG scores.
    rag_scores_cache = {}
    try:
        rag_calculator = RAGScoreCalculator()
        if rag_calculator.stats:
            logger.info("RAG Score calculator initialized successfully")
            for data in rag_calculator.all_data:
                run_id = data.get('run_id')
                if run_id:
                    rag_scores_cache[run_id] = rag_calculator.calculate_rag_score(data)
            logger.info(f"Pre-calculated {len(rag_scores_cache)} RAG scores")
        else:
            logger.warning("No RAG statistics available for score calculation")
    except Exception as e:
        logger.warning(f"Could not initialize RAG Score calculator: {e}")

    for benchmark_type in benchmark_types:
        dir_path = f"result/{benchmark_type}"

        # Skip if directory doesn't exist
        if not os.path.exists(dir_path):
            continue

        # Averaged results feed the leaderboard
        for file in sorted(glob.glob(f"{dir_path}/avg_*.json")):
            try:
                data = _read_benchmark_file(file, "avg_", benchmark_type)
                if benchmark_type == "retrieval" and rag_scores_cache:
                    _attach_cached_rag_score(data, rag_scores_cache, "avg")
                results["avg"][benchmark_type].append(data)
            except Exception as e:
                print(f"Error loading {benchmark_type} avg file: {file} - {e}")

        # Detailed results feed the pipeline-specific views
        for file in sorted(glob.glob(f"{dir_path}/detail_*.json")):
            try:
                data = _read_benchmark_file(file, "detail_", benchmark_type)
                if benchmark_type == "retrieval" and rag_scores_cache:
                    _attach_cached_rag_score(data, rag_scores_cache, "detail")
                results["raw"][benchmark_type].append(data)

                # Make sure models that only have a detail file still appear in
                # the leaderboard: add a simplified record (name, file and, for
                # retrieval, the key metrics) unless the model is already there
                simplified_data = {"model_name": data["model_name"], "file": data["file"]}
                if benchmark_type == "retrieval":
                    for key in ("RAG_score", "RAG_success_rate", "average_judge_score"):
                        if key in data:
                            simplified_data[key] = data[key]

                already_listed = any(
                    item.get("model_name") == data["model_name"]
                    for item in results["avg"][benchmark_type]
                )
                if not already_listed:
                    results["avg"][benchmark_type].append(simplified_data)
            except Exception as e:
                print(f"Error loading {benchmark_type} detail file: {file} - {e}")

    return results

def format_model_name(model_name):
    """
    Make a model name friendlier for leaderboard display.

    Underscores are turned into spaces while the original casing is kept.
    For names of the form "organization/model" only the part after the first
    slash is changed; the organization prefix is left untouched.

    Args:
        model_name: Original model name string

    Returns:
        str: Formatted model name (empty/None input is returned unchanged)
    """
    if not model_name:
        return model_name

    org, slash, name = model_name.partition("/")
    if not slash:
        # No organization prefix: format the whole name
        return model_name.replace("_", " ")

    # Keep the organization as is and format only the model part
    return f"{org}/{name.replace('_', ' ')}"

def create_evalmix_table(data):
    """
    Build the display table for the EvalMix (hybrid) benchmark results.

    Each record gets a formatted model name and "dtype"/"license" values taken
    from MODEL_METADATA_LOOKUP (these always override values found in the
    record; unknown models get "unknown"/"Unknown"). The input records are
    copied, never modified. The "file" and sample-count columns are removed,
    an "average_score" is added when lexical and semantic metrics are present,
    float columns are rounded to 2 decimals, and columns are renamed to
    display labels and put in a fixed order.

    Args:
        data: List of result dicts for the EvalMix benchmark.

    Returns:
        pd.DataFrame: Table sorted by "Turkish Semantic" (descending) when that
        column exists, otherwise by model name. Empty DataFrame for empty input.
    """
    if not data:
        return pd.DataFrame()

    default_metadata = {"dtype": "unknown", "license": "Unknown"}

    # Work on copies so the caller's records are not modified
    rows = []
    for item in data:
        row = dict(item)
        if "model_name" in row:
            raw_model_name = row["model_name"]
            row["model_name"] = format_model_name(raw_model_name)

            # Always use lookup table values for metadata (override JSON values)
            metadata = MODEL_METADATA_LOOKUP.get(raw_model_name, default_metadata)
            for field in ("dtype", "license"):
                row[field] = metadata[field]
        rows.append(row)

    df = pd.DataFrame(rows)

    # Remove the file column and all sample count columns if present
    unwanted = ["file", "total_samples", "Total Samples", "samples_number"]
    df = df.drop(columns=[col for col in unwanted if col in df.columns])

    if "model_name" in df.columns:
        df = df.sort_values(by="model_name")

    # Add the average metric; judge_metric is optional
    if all(col in df.columns for col in ["lexical_metric", "semantic_metric"]):
        if "judge_metric" in df.columns:
            df["average_score"] = df[["lexical_metric", "semantic_metric", "judge_metric"]].mean(axis=1).round(2)
        else:
            df["average_score"] = df[["lexical_metric", "semantic_metric"]].mean(axis=1).round(2)

    # Round float values to 2 decimal places (best-effort per column)
    for column in df.columns:
        try:
            if pd.api.types.is_float_dtype(df[column]):
                df[column] = df[column].round(2)
        except Exception:
            continue

    # Display labels for columns that need more than simple title-casing
    exact_labels = {
        "average_score": "Average Score",
        "lexical_metric": "Lexical Score",
        "semantic_metric": "Semantic Score",
        "judge_metric": "Judge Score",
        "openai_accuracy": "OpenAI Accuracy",
        "dtype": "Dtype",
        "license": "License",
    }

    column_mapping = {}
    for col in df.columns:
        if col == "model_name":
            column_mapping[col] = "Model Name"
        elif "turkish_semantic" in col.lower():
            # Substring match: any column mentioning turkish_semantic gets this label
            column_mapping[col] = "Turkish Semantic"
        elif "multilingual_semantic" in col.lower():
            column_mapping[col] = "Multilingual Semantic"
        elif col in exact_labels:
            column_mapping[col] = exact_labels[col]
        else:
            # Generic formatting: "some_metric" -> "Some Metric"
            column_mapping[col] = " ".join(word.capitalize() for word in col.replace("_", " ").split())

    df = df.rename(columns=column_mapping)

    # Sort by Turkish Semantic when present; the stable sort keeps the earlier
    # model-name order among rows with equal scores
    if "Turkish Semantic" in df.columns:
        df = df.sort_values(by="Turkish Semantic", ascending=False, kind="stable")

    # Desired column order for EvalMix - metadata columns at the end
    desired_cols = [
        "Model Name",
        "Turkish Semantic",
        "Multilingual Semantic",
        "Average Score",
        "Lexical Score",
        "Semantic Score",
        "Judge Score",
        "OpenAI Accuracy",
        "Dtype",
        "License"
    ]

    # Keep the desired columns that exist, then append everything else
    final_cols = [col for col in desired_cols if col in df.columns]
    final_cols.extend(col for col in df.columns if col not in final_cols)

    return df[final_cols]

def _parse_light_eval_score(value):
    """Convert a Light Eval score to float when possible; keep "N/A" and other values as is."""
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str) and value != "N/A":
        try:
            return float(value)
        except ValueError:
            # Not a number after all: show the raw text rather than failing
            return value
    return value


def create_light_eval_table(data, is_detail=False):
    """
    Creates a table from Light Eval results

    Only the known score metrics plus "dtype"/"license" are kept. Scores that
    are missing are shown as "N/A"; numeric strings are converted to float.
    "dtype"/"license" always come from MODEL_METADATA_LOOKUP (unknown models
    get "unknown"/"Unknown"). Rows are ordered by the overall average,
    highest first, with non-numeric averages last.

    Args:
        data: Light eval data (list of result dicts)
        is_detail: If True, keep 4 decimal places for detail results
            (otherwise 2)

    Returns:
        pd.DataFrame: Table with display column names; empty DataFrame for
        empty input.
    """
    if not data:
        return pd.DataFrame()

    score_metrics = [
        "overall_average",
        "mmlu_average",
        "truthfulqa",
        "winogrande",
        "hellaswag",
        "gsm8k",
        "arc_challenge",
    ]
    default_metadata = {"dtype": "unknown", "license": "Unknown"}

    # Light eval results come in a different format and need reshaping
    formatted_data = []
    for item in data:
        model_data = {"model_name": format_model_name(item.get("model_name", "Bilinmeyen Model"))}

        for metric in score_metrics:
            model_data[metric] = _parse_light_eval_score(item.get(metric, "N/A"))

        # Always use lookup table for metadata (override JSON values)
        metadata = MODEL_METADATA_LOOKUP.get(item.get("model_name", ""), default_metadata)
        for field in ("dtype", "license"):
            model_data[field] = metadata[field]

        formatted_data.append(model_data)

    df = pd.DataFrame(formatted_data)

    # Sort by overall_average, highest first. Non-numeric values ("N/A") are
    # treated as -1 for ordering only, which places them at the end.
    sort_key = pd.to_numeric(df["overall_average"], errors="coerce").fillna(-1)
    df = df.iloc[sort_key.to_numpy().argsort(kind="stable")[::-1]]

    # Round floats: 4 decimals for detail results, 2 for averages.
    # Columns that also hold "N/A" have object dtype, so their float values
    # are rounded one by one instead of being skipped.
    decimal_places = 4 if is_detail else 2
    for column in score_metrics:
        if pd.api.types.is_float_dtype(df[column]):
            df[column] = df[column].round(decimal_places)
        else:
            df[column] = df[column].apply(
                lambda value: round(value, decimal_places) if isinstance(value, float) else value
            )

    # Display names for the columns
    column_mapping = {
        "model_name": "Model Name",
        "overall_average": "Overall",
        "mmlu_average": "MMLU",
        "truthfulqa": "Truthfulqa",
        "winogrande": "Winogrande",
        "hellaswag": "Hellaswag",
        "gsm8k": "Gsm8k",
        "arc_challenge": "ARC",
        "dtype": "Dtype",
        "license": "License"
    }
    df = df.rename(columns=column_mapping)

    # Desired column order for Light-Eval - metadata columns at the end
    desired_cols = [
        "Model Name",
        "Overall",
        "MMLU",
        "Truthfulqa",
        "Winogrande",
        "Hellaswag",
        "Gsm8k",
        "ARC",
        "Dtype",
        "License"
    ]

    # Keep the desired columns that exist, then append everything else
    final_cols = [col for col in desired_cols if col in df.columns]
    final_cols.extend(col for col in df.columns if col not in final_cols)

    return df[final_cols]

def create_benchmark_plots(benchmark_data, data_type="avg"):
    """
    Creates charts from the benchmark data

    The tables returned by create_evalmix_table / create_light_eval_table use
    display column names ("Model Name", "Lexical Score", "MMLU", ...), so the
    charts are built from those names.

    Args:
        benchmark_data: Benchmark data as returned by load_benchmark_results
        data_type: Either "avg" or "raw"

    Returns:
        dict: Plotly figures keyed by benchmark type ("evalmix": grouped bar
        chart, "light_eval": radar chart); a key is absent when there is no
        usable data for that chart.
    """
    plots = {}

    # Grouped bar chart for the hybrid (EvalMix) benchmark
    evalmix_data = benchmark_data[data_type]["evalmix"]
    if evalmix_data:
        df = create_evalmix_table(evalmix_data)
        score_cols = ["Lexical Score", "Semantic Score"]
        if not df.empty and all(col in df.columns for col in ["Model Name"] + score_cols):
            # The judge score is optional
            if "Judge Score" in df.columns:
                score_cols.append("Judge Score")

            # Convert the data to long format: one row per (model, metric)
            plot_df = pd.melt(
                df,
                id_vars=["Model Name"],
                value_vars=score_cols,
                var_name="Metrik",
                value_name="Değer"
            )

            # Legend entries
            plot_df["Metrik"] = plot_df["Metrik"].replace({
                "Lexical Score": "Lexical Metric",
                "Semantic Score": "Semantic Metric",
                "Judge Score": "Judge Metric"
            })

            fig = px.bar(
                plot_df,
                x="Model Name",
                y="Değer",
                color="Metrik",
                title="Hybrid Benchmark Results",
                labels={"Model Name": "Model", "Değer": "Score"},
                barmode="group"
            )
            plots["evalmix"] = fig

    # Radar chart for Light Eval
    light_eval_data = benchmark_data[data_type]["light_eval"]
    if light_eval_data:
        df = create_light_eval_table(light_eval_data)
        if not df.empty:
            # Only the per-task scores are radar axes; the overall average and
            # the name/metadata columns are left out
            radar_axes = ["MMLU", "Truthfulqa", "Winogrande", "Hellaswag", "Gsm8k", "ARC"]
            metric_cols = [col for col in radar_axes if col in df.columns]
            if metric_cols:
                fig = go.Figure()

                for _, row in df.iterrows():
                    # Scores may be "N/A"; those become NaN so the trace only
                    # contains numeric values
                    scores = pd.to_numeric(row[metric_cols], errors="coerce").tolist()
                    fig.add_trace(go.Scatterpolar(
                        r=scores,
                        theta=metric_cols,
                        fill='toself',
                        name=row.get("Model Name", "Unknown Model")
                    ))

                fig.update_layout(
                    polar=dict(
                        radialaxis=dict(
                            visible=True,
                            range=[0, 1]
                        )
                    ),
                    title="Light Eval Results",
                    showlegend=True
                )
                plots["light_eval"] = fig

    return plots

def create_combined_leaderboard_table(benchmark_data):
    """
    Creates a combined leaderboard table from avg JSON data

    One row is built per formatted model name, merging the headline score of
    every benchmark type into a single record.

    Args:
        benchmark_data: Mapping with "raw" and "avg" entries; each maps a
            benchmark type to a list of per-model result dicts. The
            "human_arena" and "retrieval" benchmarks are read from "raw",
            all others from "avg".

    Returns:
        pandas.DataFrame with "Model Name" followed by the score and metadata
        columns that are present, sorted by "Auto Elo Score" (or
        "Human Elo Score" when the former is absent) in descending order.
        An empty DataFrame is returned when no named model is found.
    """
    # Define benchmark types to include in the leaderboard ("lm_harness" and
    # "snake" are intentionally not part of it)
    benchmark_types = ["evalmix", "light_eval", "retrieval", "arena", "human_arena"]
    # For these types the avg files don't have complete info, so raw data is used
    raw_sourced_types = ("human_arena", "retrieval")
    # Metadata used when a model is missing from MODEL_METADATA_LOOKUP
    metadata_defaults = {"dtype": "unknown", "license": "Unknown"}

    all_models = {}

    for benchmark_type in benchmark_types:
        source_kind = "raw" if benchmark_type in raw_sourced_types else "avg"
        data_source = benchmark_data[source_kind][benchmark_type]

        # Skip if no data for this benchmark type
        if not data_source:
            continue

        for item in data_source:
            model_name = item.get("model_name", "")
            if not model_name:
                continue

            formatted_model_name = format_model_name(model_name)

            # Create entry for this model if it doesn't exist. Metadata always
            # comes from the lookup table (keyed by the raw model name), never
            # from the JSON payload.
            if formatted_model_name not in all_models:
                metadata = MODEL_METADATA_LOOKUP.get(model_name, metadata_defaults)
                all_models[formatted_model_name] = {
                    "model_name": formatted_model_name,
                    "dtype": metadata["dtype"],
                    "license": metadata["license"],
                }

            entry = all_models[formatted_model_name]

            # Extract only the fields we care about for each benchmark type
            if benchmark_type == "evalmix":
                if "lexical_metric" in item:
                    entry["Lexical"] = round(item["lexical_metric"], 2)
                if "semantic_metric" in item:
                    entry["Multilingual Semantic"] = round(item["semantic_metric"], 2)

                # Turkish Semantic: first matching source wins
                if "turkish_semantic" in item:
                    entry["Turkish Semantic"] = round(item["turkish_semantic"], 2)
                elif "turkish_semantic_" in item:
                    entry["Turkish Semantic"] = round(item["turkish_semantic_"], 2)
                elif "nlp_metrics" in item and "cosine_similarity_turkish" in item.get("nlp_metrics", {}):
                    turkish_sim = item.get("nlp_metrics", {}).get("cosine_similarity_turkish", {}).get("mean", 0)
                    entry["Turkish Semantic"] = round(turkish_sim, 2)

                # Explicit Multilingual Semantic values override the generic
                # "semantic_metric" stored above
                if "multilingual_semantic" in item:
                    entry["Multilingual Semantic"] = round(item["multilingual_semantic"], 2)
                elif "multilingual_semantic_" in item:
                    entry["Multilingual Semantic"] = round(item["multilingual_semantic_"], 2)
                elif "nlp_metrics" in item and "cosine_similarity_multilingual" in item.get("nlp_metrics", {}):
                    multi_sim = item.get("nlp_metrics", {}).get("cosine_similarity_multilingual", {}).get("mean", 0)
                    entry["Multilingual Semantic"] = round(multi_sim, 2)

                # Extract BERTScore F1 if available.
                # NOTE(review): "BERTScore F1" is not listed in desired_order
                # below, so this value is dropped before the table is
                # returned - confirm whether it should be displayed.
                if "bert_score" in item and isinstance(item.get("bert_score"), dict) and "f1" in item.get("bert_score", {}):
                    bert_f1 = item.get("bert_score", {}).get("f1", {}).get("mean", 0)
                    entry["BERTScore F1"] = round(bert_f1, 2)
                elif "nlp_metrics" in item and "bert_score" in item.get("nlp_metrics", {}):
                    bert_f1 = item.get("nlp_metrics", {}).get("bert_score", {}).get("f1", {}).get("mean", 0)
                    entry["BERTScore F1"] = round(bert_f1, 2)
            elif benchmark_type == "light_eval":
                if "overall_average" in item:
                    try:
                        avg_value = item["overall_average"]
                        # Numeric strings are converted; the "N/A" marker is
                        # left alone and ends up in the fallback below
                        if isinstance(avg_value, str) and avg_value != "N/A":
                            avg_value = float(avg_value)
                        entry["Light Eval"] = round(avg_value, 2)
                    except (ValueError, TypeError):
                        # Not a number: keep the raw value as-is
                        entry["Light Eval"] = item["overall_average"]
            elif benchmark_type == "retrieval":
                # Prefer RAG_score if available, otherwise use RAG_success_rate
                if "RAG_score" in item:
                    entry["Retrieval"] = round(item["RAG_score"], 4)  # Higher precision for RAG Score
                elif "RAG_success_rate" in item:
                    entry["Retrieval"] = round(item["RAG_success_rate"], 2)
            elif benchmark_type == "arena":
                if "Melo Score" in item:
                    entry["Auto Elo Score"] = round(item["Melo Score"], 2)
            elif benchmark_type == "human_arena":
                if "elo_rating" in item:
                    entry["Human Elo Score"] = round(item["elo_rating"], 2)

    if not all_models:
        return pd.DataFrame()

    # Create DataFrame from the collected data
    df = pd.DataFrame(list(all_models.values()))

    # Rename to user-friendly / proper-case column names
    df = df.rename(columns={
        "model_name": "Model Name",
        "dtype": "Dtype",
        "license": "License"
    })

    # Make sure file / run_id / user_id columns never reach the UI
    df = df.drop(columns=["file", "run_id", "user_id", "Run Id", "User Id"], errors="ignore")

    # Exact columns to display, in the order the UI shows them
    desired_order = [
        "Model Name",
        "Auto Elo Score",
        "Human Elo Score",
        "Retrieval",
        "Light Eval",
        "Turkish Semantic",
        "Multilingual Semantic",
        "Lexical",
        "Dtype",
        "License"
    ]

    # Filter out columns that don't exist in the DataFrame
    actual_order = [col for col in desired_order if col in df.columns]

    # Fill NaN values with 0 (a model that was not evaluated on a benchmark)
    for col in actual_order:
        if col != "Model Name":
            df[col] = df[col].fillna(0)

    # Reorder columns
    if actual_order:
        df = df[actual_order]

    # Sort by Auto Elo Score if available, otherwise by Human Elo Score
    if "Auto Elo Score" in df.columns:
        df = df.sort_values(by="Auto Elo Score", ascending=False)
    elif "Human Elo Score" in df.columns:
        df = df.sort_values(by="Human Elo Score", ascending=False)

    # Round float values to 2 decimal places. Rounding stays best-effort per
    # column, but no longer swallows KeyboardInterrupt/SystemExit.
    for column in df.columns:
        try:
            if pd.api.types.is_float_dtype(df[column]):
                df[column] = df[column].round(2)
        except Exception as exc:
            logger.warning(f"Could not round column {column}: {exc}")
            continue

    return df

def _order_detail_columns(df, desired_cols, keep_remaining=True):
    """
    Puts the desired columns that exist in df first, in the given order.

    Args:
        df: DataFrame to reorder
        desired_cols: Preferred column order (missing names are ignored)
        keep_remaining: When True, columns not listed in desired_cols are
            appended after the desired ones; when False they are dropped

    Returns:
        DataFrame restricted/reordered to the resulting column list
    """
    final_cols = [col for col in desired_cols if col in df.columns]
    if keep_remaining:
        final_cols.extend([col for col in df.columns if col not in final_cols])
    return df[final_cols]


def create_raw_details_table(benchmark_data, benchmark_type):
    """
    Creates a detailed table from raw JSON data for a specific benchmark type

    Each raw item is flattened into one row (nested dicts and lists of dicts
    are expanded through _flatten_dict), metadata columns are filled from
    MODEL_METADATA_LOOKUP, columns are renamed for display and the table is
    sorted by the primary metric of the benchmark type.

    Args:
        benchmark_data: Mapping whose "raw" entry maps benchmark types to
            lists of per-model result dicts
        benchmark_type: Benchmark to render ("arena", "retrieval", "evalmix",
            "light_eval", "snake", ...)

    Returns:
        pandas.DataFrame with display-ready column names; an empty DataFrame
        when there is no raw data for the benchmark type.
    """
    raw_items = benchmark_data["raw"][benchmark_type]
    if not raw_items:
        return pd.DataFrame()

    # Metadata used when a model is missing from MODEL_METADATA_LOOKUP
    metadata_defaults = {"dtype": "unknown", "license": "Unknown"}

    # Define metadata fields to exclude - especially for LightEval
    excluded_fields = ["file", "job_id", "start_time", "end_time", "run_id", "user_id",
                       "total_samples", "Total Samples", "samples_number", "sample_count", "eval_samples",
                       "total_success_references", "Total Success References", "total_eval_samples",
                       "provider", "Provider"]  # Exclude provider fields

    # For LightEval, also exclude mmlu_tasks field
    if benchmark_type == "light_eval":
        excluded_fields.append("mmlu_tasks")

    # Fields that are set explicitly below and must not be overwritten by the
    # raw JSON values. "model_name" belongs here: copying it from the item
    # would replace the formatted name with the raw one.
    managed_fields = ("dtype", "license", "model_name")

    # Flatten the raw data
    flattened_data = []

    for item in raw_items:
        raw_model_name = item.get("model_name", "Unknown Model")
        flat_item = {
            "file": item.get("file", ""),
            "model_name": format_model_name(raw_model_name)
        }

        # Always use lookup table values for metadata (override JSON values)
        metadata = MODEL_METADATA_LOOKUP.get(raw_model_name, metadata_defaults)
        flat_item["dtype"] = metadata["dtype"]
        flat_item["license"] = metadata["license"]

        # Add top-level scalar fields first so they precede the flattened
        # nested fields in the column order
        for key, value in item.items():
            if key in excluded_fields or key in managed_fields or key.startswith("_"):
                continue
            if isinstance(value, (dict, list)):
                continue
            flat_item[key] = value

        # Flatten nested fields
        for key, value in item.items():
            if key.startswith("_") or key in excluded_fields:
                # Skip metadata fields
                continue
            elif isinstance(value, dict):
                # Flatten nested dictionaries
                _flatten_dict(value, flat_item, prefix=key)
            elif isinstance(value, list) and all(isinstance(x, dict) for x in value):
                # Flatten list of dictionaries
                for i, sub_dict in enumerate(value):
                    _flatten_dict(sub_dict, flat_item, prefix=f"{key}_{i}")

        flattened_data.append(flat_item)

    # Create DataFrame
    df = pd.DataFrame(flattened_data)

    # Format confidence interval for arena data
    if benchmark_type == "arena" and "95%(CI)" in df.columns:
        def format_confidence_interval(ci_value):
            """Convert '-1.65/+2.66' to '+2.66/-1.65' format"""
            if isinstance(ci_value, str) and "/" in ci_value:
                parts = ci_value.split("/")
                if len(parts) == 2:
                    negative_part = parts[0].strip()
                    positive_part = parts[1].strip()

                    # Remove the signs and get the numbers
                    negative_num = negative_part[1:] if negative_part.startswith("-") else negative_part
                    positive_num = positive_part[1:] if positive_part.startswith("+") else positive_part

                    # Return in +positive/-negative format
                    return f"+{positive_num}/-{negative_num}"
            # Anything that is not a two-part string is passed through untouched
            return ci_value

        df["95%(CI)"] = df["95%(CI)"].apply(format_confidence_interval)

    # Ensure model_name is first column
    if "model_name" in df.columns:
        cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
        df = df[cols]

    # Round float values to 2 decimal places. Rounding stays best-effort per
    # column, but no longer swallows KeyboardInterrupt/SystemExit.
    for column in df.columns:
        try:
            if pd.api.types.is_float_dtype(df[column]):
                df[column] = df[column].round(2)
        except Exception as exc:
            logger.warning(f"Could not round column {column}: {exc}")
            continue

    # Remove the file column
    if 'file' in df.columns:
        df = df.drop(columns=['file'])

    # Format column names for better display based on benchmark type
    column_mapping = {
        "model_name": "Model Name",
        "dtype": "Dtype",
        "license": "License"
    }

    # Use specific column mappings for each benchmark type
    if benchmark_type == "arena":
        # Arena benchmark column mappings
        # NOTE(review): "Licance" looks like a misspelled key coming from the
        # source data; it is kept so such a column still maps to "License".
        column_mapping.update({
            "Melo Score": "Auto Elo Score",
            "Win Rate": "Win Rate",
            "95%(CI)": "95% CI",
            "Response Tokens Average": "Completion Tokens",
            "dtype": "Dtype",
            "Licance": "License",
        })

    elif benchmark_type == "retrieval":
        # RAG benchmark column mappings
        column_mapping.update({
            "RAG_score": "RAG Score",
            "RAG_success_rate": "Rag Success Rate",
            "max_correct_references": "Max Correct Ref.",
            "total_false_positives": "Hallucinate Ref.",
            "total_missed_references": "Missed Ref.",
            "average_judge_score": "Legal Judge Score"
        })

    elif benchmark_type == "evalmix":
        # Hybrid/EvalMix benchmark column mappings
        column_mapping.update({
            "turkish_semantic_mean": "Turkish Semantic",
            "turkish_semantic": "Turkish Semantic",
            "multilingual_semantic_mean": "Multilingual Semantic",
            "multilingual_semantic": "Multilingual Semantic",
            "judge_metric": "Judge Score",
            "bleu mean": "BLEU",
            "rouge1 mean": "ROUGE-1",
            "rouge2 mean": "ROUGE-2",
            "rougeL mean": "ROUGE-L",
            "bert_score f1 mean": "BERTScore F1",
            "dtype": "Dtype",
            "license": "License",
            "bert_score precision mean": "BERTScore Precision"
        })

        # Calculate Judge Average Score from OpenAI scores if they exist.
        # NOTE(review): if the data also carries "judge_metric", both columns
        # are renamed to "Judge Score" - confirm that cannot happen.
        openai_cols = ["openai_accuracy", "openai_relevance", "openai_coherence"]
        if all(col in df.columns for col in openai_cols):
            df["judge_average_score"] = df[openai_cols].mean(axis=1).round(2)
            column_mapping["judge_average_score"] = "Judge Score"

            # Remove individual OpenAI score columns
            df = df.drop(columns=openai_cols)

    elif benchmark_type == "light_eval":
        # Light Eval benchmark column mappings
        column_mapping.update({
            "overall_average": "Overall",
            "mmlu_average": "MMLU",
            "truthfulqa": "Truthfulqa",
            "winogrande": "Winogrande",
            "hellaswag": "Hellaswag",
            "gsm8k": "Gsm8k",
            "arc_challenge": "ARC",
            "dtype": "Dtype",
            "license": "License"
        })

    elif benchmark_type == "snake":
        # Snake benchmark column mappings
        column_mapping.update({
            "elo": "Elo Rating",
            "win_rate": "Win Rate",
            "draw_rate": "Draw Rate",
            "dtype": "Dtype",
            "license": "License"
        })

    # For any columns not specifically mapped, use the default formatting:
    # drop a " mean" suffix, turn underscores into spaces and capitalize words
    for col in df.columns:
        if col not in column_mapping:
            cleaned_col = col.replace(" mean", "")
            column_mapping[col] = " ".join(
                word.capitalize() for word in cleaned_col.replace("_", " ").split()
            )

    # Rename DataFrame columns
    df = df.rename(columns=column_mapping)

    # Drop specific columns based on benchmark type
    if benchmark_type == "retrieval" and "Success Ref." in df.columns:
        df = df.drop(columns=["Success Ref."])
    # Drop "Total Success References" column if it exists
    if "Total Success References" in df.columns:
        df = df.drop(columns=["Total Success References"])

    # Sort by specific metrics based on benchmark type - AFTER column renaming
    if benchmark_type == "arena" and "Auto Elo Score" in df.columns:
        df = df.sort_values(by="Auto Elo Score", ascending=False)

        # Desired column order for Arena - metadata columns at the end
        df = _order_detail_columns(df, [
            "Model Name",
            "Auto Elo Score",
            "Win Rate",
            "95% CI",
            "Completion Tokens",
            "Dtype",
            "License"
        ])

    elif benchmark_type == "retrieval":
        # Sort by RAG Score if available, otherwise by Rag Success Rate
        if "RAG Score" in df.columns:
            df = df.sort_values(by="RAG Score", ascending=False)
        elif "Rag Success Rate" in df.columns:
            df = df.sort_values(by="Rag Success Rate", ascending=False)

        # Desired column order for Retrieval - metadata columns at the end
        df = _order_detail_columns(df, [
            "Model Name",
            "RAG Score",
            "Rag Success Rate",
            "Max Correct Ref.",
            "Hallucinate Ref.",
            "Missed Ref.",
            "Legal Judge Score",
            "Dtype",
            "License"
        ])

    elif benchmark_type == "evalmix":
        if "Turkish Semantic" in df.columns:
            df = df.sort_values(by="Turkish Semantic", ascending=False)

            # Only the listed columns are kept for EvalMix; everything else
            # is dropped from the table
            df = _order_detail_columns(df, [
                "Model Name",
                "Turkish Semantic",
                "Multilingual Semantic",
                "Judge Score",
                "BLEU",
                "ROUGE-1",
                "ROUGE-2",
                "ROUGE-L",
                "BERTScore F1",
                "BERTScore Precision",
                "BERTScore Recall",
                "Dtype",
                "License"
            ], keep_remaining=False)

    elif benchmark_type == "light_eval" and "Overall" in df.columns:
        df = df.sort_values(by="Overall", ascending=False)

    elif benchmark_type == "snake":
        # Sort by Elo or Elo Rating if available
        if "Elo Rating" in df.columns:
            df = df.sort_values(by="Elo Rating", ascending=False)
        elif "Elo" in df.columns:
            df = df.sort_values(by="Elo", ascending=False)

        # Desired column order for Snake - metadata columns at the end
        df = _order_detail_columns(df, [
            "Model Name",
            "Elo Rating",
            "Win Rate",
            "Draw Rate",
            "Wins",
            "Losses",
            "Ties",
            "Loss Rate",
            "Dtype",
            "License"
        ])

    return df

# Keys that are never copied into the flattened output
_FLATTEN_EXCLUDED_FIELDS = frozenset([
    "total_success_references", "total_eval_samples",
    "details", "metadata", "config", "logs",
])

# Key renames applied before the prefix is added
_FLATTEN_SPECIAL_FIELD_MAPPINGS = {
    "turkish_semantic_mean": "turkish_semantic",
    "turkish_semantic_ mean": "turkish_semantic",
    "multilingual_semantic_mean": "multilingual_semantic"
}


def _flatten_dict(d, target_dict, prefix=""):
    """
    Flattens nested dictionaries

    Nested keys are joined with "_" (e.g. {"a": {"b": 1}} with prefix "p"
    yields "p_a_b"). Values are handled as follows:
      - dict: flattened recursively
      - list of dicts: each element is flattened under "<key>_<index>"
        (an empty list adds nothing)
      - list of numbers: replaced by "<key>_mean" and, for more than one
        element, "<key>_std" (population standard deviation), both rounded
        to 2 decimals; bools count as numbers here
      - any other list: stored as its string representation
      - float: rounded to 2 decimals
      - anything else: stored unchanged

    Args:
        d: Dictionary to flatten
        target_dict: Target dictionary to add flattened values to (mutated
            in place; nothing is returned)
        prefix: Key prefix
    """
    for key, value in d.items():
        # Skip excluded fields
        if key in _FLATTEN_EXCLUDED_FIELDS:
            continue

        # Apply special field name transformations
        transformed_key = _FLATTEN_SPECIAL_FIELD_MAPPINGS.get(key, key)
        new_key = f"{prefix}_{transformed_key}" if prefix else transformed_key

        if isinstance(value, dict):
            # Flatten nested dictionaries
            _flatten_dict(value, target_dict, new_key)
        elif isinstance(value, list):
            if all(isinstance(x, dict) for x in value):
                # Flatten list of dictionaries (also covers the empty list)
                for i, sub_dict in enumerate(value):
                    _flatten_dict(sub_dict, target_dict, f"{new_key}_{i}")
            elif all(isinstance(x, (int, float)) for x in value):
                # For numeric lists, calculate mean and std
                try:
                    # numpy is imported lazily: it is only needed on this path
                    import numpy as np
                    target_dict[f"{new_key}_mean"] = round(sum(value) / len(value), 2)
                    if len(value) > 1:
                        target_dict[f"{new_key}_std"] = round(np.std(value), 2)
                except (TypeError, ValueError, OverflowError):
                    # Fallback to string representation
                    target_dict[new_key] = str(value)
            else:
                # For non-numeric lists, convert to string
                target_dict[new_key] = str(value)
        elif isinstance(value, float):
            # Round float values
            target_dict[new_key] = round(value, 2)
        else:
            # Add other values directly
            target_dict[new_key] = value

def update_supported_base_models():
    """
    Updates the list of supported base models by querying API.
    This function is called when the application starts to keep the base model list up to date.

    The API is probed with a deliberately invalid base model; the list of
    supported models is then parsed out of the "detail" field of the error
    response and passed to api.config.update_base_model_list.

    Returns:
        List of supported model names on success, otherwise None (missing
        configuration, unexpected response, or any error, which is logged).
    """
    # Upper bound for the probe request so a stalled endpoint cannot block
    # application start-up indefinitely
    request_timeout_seconds = 30

    try:
        # requests, re, os and load_dotenv come from the module-level imports

        # Load environment variables from .env file
        load_dotenv()

        # Get API key from environment variable
        api_key = os.getenv("API_KEY")
        if not api_key:
            logger.error("API_KEY not found in environment variables")
            return None

        # API endpoint
        url = os.getenv("API_URL")
        if not url:
            logger.error("API_URL not found in environment variables")
            return None

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

        # Test payload with non-existent model: the request is expected to be
        # rejected, and the rejection message lists the supported models
        payload = {
            "source": "FILE_ID_BURAYA_GELECEK",
            "base_model": "non-existent-model/fake-model-123",
            "name": "test-invalid-model",
            "description": "Desteklenen modelleri görmeye çalışıyorum"
        }

        # Make the request
        response = requests.post(
            url,
            headers=headers,
            json=payload,
            timeout=request_timeout_seconds,
        )

        # A successful response means the probe did not trigger the error
        # message we rely on
        if response.status_code == 200:
            logger.error("Unexpected successful response from API")
            return None

        # Extract the list of supported models from the error message
        error_detail = response.json().get("detail", "")
        match = re.search(r"list of supported models: \[(.*?)\]", error_detail)
        if not match:
            logger.error("Could not extract supported models from API response")
            return None

        # Parse the list of models without filtering out 'fast' models
        supported_models = [model.strip("'") for model in match.group(1).split(", ")]

        # Update the base model list in the configuration (imported only
        # when it is actually needed)
        from api.config import update_base_model_list
        update_base_model_list(supported_models)

        logger.info(f"Successfully updated supported base models: {supported_models}")
        return supported_models

    except Exception as e:
        # Best-effort: any failure (network, timeout, bad JSON, ...) is
        # logged and reported as None
        logger.error(f"Error updating supported base models: {str(e)}")
        return None

def create_human_arena_table(data):
    """
    Create Human Arena results table from detail data

    The input items are copied before being modified, so the caller's data
    keeps its raw model names and can be processed again (metadata lookup is
    keyed by the raw name).

    Args:
        data: List of per-model result dicts

    Returns:
        pandas.DataFrame with display column names, sorted by
        "Human Elo Score" in descending order; an empty DataFrame when data
        is empty.
    """
    if not data:
        return pd.DataFrame()

    # Metadata used when a model is missing from MODEL_METADATA_LOOKUP
    metadata_defaults = {"dtype": "unknown", "license": "Unknown"}

    # Apply model name formatting and add metadata from lookup table,
    # working on shallow copies so the input dicts stay untouched
    rows = []
    for item in data:
        row = dict(item)
        if "model_name" in row:
            raw_model_name = row["model_name"]
            row["model_name"] = format_model_name(raw_model_name)

            # Always use lookup table values for metadata (override JSON values)
            metadata = MODEL_METADATA_LOOKUP.get(raw_model_name, metadata_defaults)
            row["dtype"] = metadata["dtype"]
            row["license"] = metadata["license"]
        rows.append(row)

    df = pd.DataFrame(rows)

    # Ensure model_name is first column
    if "model_name" in df.columns:
        cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
        df = df[cols]

    # Define column mapping for better display
    column_mapping = {
        'model_name': 'Model Name',
        'elo_rating': 'Human Elo Score',
        'wins': 'Wins',
        'losses': 'Losses',
        'ties': 'Ties',
        'total_games': 'Total Games',
        'win_rate': 'Win Rate (%)',
        'votes': 'Votes',
        'dtype': 'Dtype',
        'license': 'License',
        'evaluation_date': 'Evaluation Date',
        'evaluation_type': 'Type'
    }

    # Rename columns
    df = df.rename(columns=column_mapping)

    # Remove file, run_id, evaluation_date, evaluation_type, votes, and provider columns if present
    columns_to_remove = ['file', 'run_id', 'Evaluation Date', 'Type', 'provider', 'Provider', 'Votes']
    df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])

    # Coerce the score columns to numbers before sorting, so string-typed
    # values are ordered numerically (unparseable values become NaN)
    numeric_cols = [col for col in ['Human Elo Score', 'Win Rate (%)'] if col in df.columns]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Sort by Human Elo Score in descending order
    if 'Human Elo Score' in df.columns:
        df = df.sort_values(by='Human Elo Score', ascending=False)

    # Round numeric columns (after sorting, so rounding cannot create ties
    # that change the order)
    for col in numeric_cols:
        df[col] = df[col].round(2)

    return df