Spaces:

Synthefy
/

MUSEval

Running

App Files Files Community

Calcharles committed on Oct 9

Commit

22df562

1 Parent(s): 057702d

updated demo

Browse files

Files changed (17) hide show

.gitignore +26 -0
app.py +196 -498
demo.py +56 -0
requirements.txt +4 -0
requirements_local.txt +3 -0
results/sample_submission/metadata.json +9 -0
sample_bulk_submission.json → results/sample_submission/sample_bulk_submission.json +0 -0
results/sample_submission2/metadata.json +9 -0
results/sample_submission2/results.json +1292 -0
src/__init__.py +21 -0
src/about.py +109 -0
src/display/css_html_js.py +24 -0
src/display/utils.py +68 -0
src/envs.py +24 -0
src/load_results.py +285 -0
src/populate.py +63 -0
src/utils.py +91 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,26 @@

+uv.lock
+test_local.py
+test_visualization.py
+pyproject.toml
+# Python cache files
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg

app.py CHANGED Viewed

@@ -1,530 +1,228 @@
 import gradio as gr
 import pandas as pd
-import json
-import os
-from datetime import datetime
-from typing import Dict, List, Any
-import numpy as np
-# Sample results will be loaded from sample_bulk_submission.json
-def load_results() -> List[Dict]:
-    """Load results from file or return sample data"""
-    results_file = "results.json"
-    if os.path.exists(results_file):
-        with open(results_file, 'r') as f:
-            data = json.load(f)
-            return data.get("results", [])
-    # Load sample data from bulk submission file
-    sample_file = "sample_bulk_submission.json"
-    if os.path.exists(sample_file):
-        with open(sample_file, 'r') as f:
-            sample_data = json.load(f)
-            # Convert bulk submission format to results format
-            results = []
-            for entry in sample_data:
-                result = {
-                    "model": "EXAMPLE",
-                    "submitter": "Research Team",
-                    "submission_date": "2025-10-09",
-                    "metrics": entry["metrics"],
-                    "task": "multivariate_forecasting",
-                    "domain": entry["domain"],
-                    "category": entry["category"],
-                    "dataset": entry["dataset"],
-                    "dataset_version": entry["dataset_version"],
-                    "paper_url": "https://example.com/paper1",
-                    "code_url": "https://github.com/example/repo1"
-                }
-                results.append(result)
-            return results
-    # Fallback empty results
-    return []
-def save_results(results: List[Dict]):
-    """Save results to file"""
-    results_file = "results.json"
-    data = {"results": results}
-    with open(results_file, 'w') as f:
-        json.dump(data, f, indent=2)
-def create_leaderboard_df(results: List[Dict]) -> pd.DataFrame:
-    """Create a pandas DataFrame for the leaderboard display"""
-    if not results:
-        return pd.DataFrame()
-    # Extract metrics and create flattened structure
-    flattened_results = []
-    for result in results:
-        metrics = result["metrics"]
-        row = {
-            "Rank": 0,  # Will be calculated
-            "Model": result["model"],
-            "Submitter": result["submitter"],
-            "Submission Date": result["submission_date"],
-            "MAE": f"{metrics['MAE']:.3f}",
-            "Uni-MAE": f"{metrics.get('Uni-MAE', 0):.3f}",
-            "RMSE": f"{metrics['RMSE']:.3f}",
-            "MAPE": f"{metrics['MAPE']:.1f}%",
-            "R²": f"{metrics['R²']:.3f}",
-            "SMAPE": f"{metrics['SMAPE']:.1f}%",
-            "Uni-Multi": f"{metrics.get('Uni-Multi', 0):.3f}",
-            "Task": result["task"],
-            "Domain": result.get("domain", "general"),
-            "Category": result.get("category", "traditional"),
-            "Dataset": result.get("dataset", "MUSED-FM"),
-            "Dataset Version": result["dataset_version"]
-        }
-        flattened_results.append(row)
-    # Sort by MAE (lower is better) and assign ranks
-    flattened_results.sort(key=lambda x: float(x["MAE"]))
-    for i, row in enumerate(flattened_results):
-        row["Rank"] = i + 1
-    return pd.DataFrame(flattened_results)
-def submit_model(model_name: str, submitter_name: str, mae: float, uni_mae: float, rmse: float,
-                mape: float, r2: float, smape: float, uni_multi: float, task: str,
-                domain: str, category: str, dataset: str, dataset_version: str, paper_url: str, code_url: str) -> str:
-    """Handle model submission"""
-    try:
-        # Validate inputs
-        if not model_name or not submitter_name:
-            return "❌ Model name and submitter name are required!"
-        if mae <= 0 or uni_mae <= 0 or rmse <= 0 or mape < 0 or r2 < 0 or smape < 0 or uni_multi <= 0:
-            return "❌ All metrics must be positive values!"
-        # Load existing results
-        results = load_results()
-        # Check if model already exists
-        for result in results:
-            if result["model"].lower() == model_name.lower():
-                return f"❌ Model '{model_name}' already exists in the leaderboard!"
-        # Create new submission
-        new_submission = {
-            "model": model_name,
-            "submitter": submitter_name,
-            "submission_date": datetime.now().strftime("%Y-%m-%d"),
-            "metrics": {
-                "MAE": float(mae),
-                "Uni-MAE": float(uni_mae),
-                "RMSE": float(rmse),
-                "MAPE": float(mape),
-                "R²": float(r2),
-                "SMAPE": float(smape),
-                "Uni-Multi": float(uni_multi)
-            },
-            "task": task,
-            "domain": domain,
-            "category": category,
-            "dataset": dataset,
-            "dataset_version": dataset_version,
-            "paper_url": paper_url,
-            "code_url": code_url
-        }
-        # Add to results
-        results.append(new_submission)
-        save_results(results)
-        return f"✅ Successfully submitted model '{model_name}' to the leaderboard!"
-    except Exception as e:
-        return f"❌ Error submitting model: {str(e)}"
-def update_leaderboard_by_domain(domain: str = "all"):
-    """Update the leaderboard display filtered by domain"""
-    results = load_results()
-    if domain != "all":
-        results = [r for r in results if r.get("domain", "general") == domain]
-    df = create_leaderboard_df(results)
-    return df
-def update_leaderboard_by_category(category: str = "all"):
-    """Update the leaderboard display filtered by category"""
-    results = load_results()
-    if category != "all":
-        results = [r for r in results if r.get("category", "traditional") == category]
-    df = create_leaderboard_df(results)
-    return df
-def update_leaderboard_overall(domain_filter: str = "all", category_filter: str = "all", dataset_filter: str = "all"):
-    """Update the overall leaderboard display with optional filtering"""
-    results = load_results()
-    # Apply filters
-    if domain_filter != "all":
-        results = [r for r in results if r.get("domain", "general") == domain_filter]
-    if category_filter != "all":
-        results = [r for r in results if r.get("category", "traditional") == category_filter]
-    if dataset_filter != "all":
-        results = [r for r in results if r.get("dataset", "MUSED-FM") == dataset_filter]
-    df = create_leaderboard_df(results)
-    return df
-def get_domains():
-    """Get list of available domains"""
-    results = load_results()
-    domains = list(set([r.get("domain", "general") for r in results]))
-    return ["all"] + sorted(domains)
-def get_datasets():
-    """Get list of available datasets"""
-    results = load_results()
-    datasets = list(set([r.get("dataset", "MUSED-FM") for r in results]))
-    return ["all"] + sorted(datasets)
-def get_categories():
-    """Get list of available categories"""
-    results = load_results()
-    categories = list(set([r.get("category", "traditional") for r in results]))
-    return ["all"] + sorted(categories)
-def get_datasets_by_domain_category(domain: str, category: str):
-    """Get datasets filtered by domain and category"""
-    results = load_results()
-    filtered_results = [r for r in results if
-                       (domain == "all" or r.get("domain", "general") == domain) and
-                       (category == "all" or r.get("category", "traditional") == category)]
-    datasets = list(set([r.get("dataset", "MUSED-FM") for r in filtered_results]))
-    return ["all"] + sorted(datasets)
-def submit_bulk_results(model_name: str, submitter_name: str, results_data: str, paper_url: str, code_url: str) -> str:
-    """Handle bulk submission of results for multiple domain/dataset combinations"""
-    try:
-        import json
-        # Parse the bulk results data
-        bulk_data = json.loads(results_data)
-        if not isinstance(bulk_data, list):
-            return "❌ Bulk data must be a list of result entries!"
-        # Load existing results
-        existing_results = load_results()
-        # Validate and add each result
-        added_count = 0
-        for result_entry in bulk_data:
-            # Validate required fields
-            required_fields = ["domain", "category", "dataset", "metrics"]
-            if not all(field in result_entry for field in required_fields):
-                continue
-            # Create submission entry
-            submission = {
-                "model": model_name,
-                "submitter": submitter_name,
-                "submission_date": datetime.now().strftime("%Y-%m-%d"),
-                "metrics": result_entry["metrics"],
-                "task": "multivariate_forecasting",
-                "domain": result_entry["domain"],
-                "category": result_entry["category"],
-                "dataset": result_entry["dataset"],
-                "dataset_version": result_entry.get("dataset_version", "v1.0"),
-                "paper_url": paper_url,
-                "code_url": code_url
-            }
-            # Check for duplicates
-            is_duplicate = any(
-                r["model"].lower() == model_name.lower() and
-                r["domain"] == result_entry["domain"] and
-                r["category"] == result_entry["category"] and
-                r["dataset"] == result_entry["dataset"]
-                for r in existing_results
-            )
-            if not is_duplicate:
-                existing_results.append(submission)
-                added_count += 1
-        if added_count > 0:
-            save_results(existing_results)
-            return f"✅ Successfully submitted {added_count} result entries for model '{model_name}'!"
-        else:
-            return "❌ No new results were added. Check for duplicates or invalid data."
-    except json.JSONDecodeError:
-        return "❌ Invalid JSON format in bulk results data!"
-    except Exception as e:
-        return f"❌ Error submitting bulk results: {str(e)}"
-# Create the Gradio interface
-with gr.Blocks(title="MUSED-FM Leaderboard", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🏆 MUSED-FM Leaderboard
-    Welcome to the MUSED-FM (Multivariate Time Series Dataset) Leaderboard! This leaderboard tracks the performance of different models on multivariate time series forecasting tasks across various domains and datasets.
-    ## 📊 Evaluation Metrics
-    - **MAE**: Mean Absolute Error (lower is better)
-    - **Uni-MAE**: Univariate Mean Absolute Error (lower is better)
-    - **RMSE**: Root Mean Square Error (lower is better)
-    - **MAPE**: Mean Absolute Percentage Error (lower is better)
-    - **R²**: Coefficient of Determination (higher is better)
-    - **SMAPE**: Symmetric Mean Absolute Percentage Error (lower is better)
-    - **Uni-Multi**: Univariate-Multivariate comparison metric (lower is better)
-    ## 🎯 Tasks
-    - **multivariate_forecasting**: Multivariate time series forecasting
-    ## 🌐 Domains & Categories
-    - **Causal Model** (synthetic): Synthetic causal modeling datasets
-    - **Dynamic** (synthetic): Dynamic system datasets
-    - **Energy** (traditional): Energy consumption and production
-    - **Engineering** (traditional): Engineering sensor data
-    - **Environment** (traditional): Environmental monitoring
-    - **Finance** (traditional): Financial time series
-    - **Health** (traditional): Medical and health data
-    - **Image** (sequential): Image-based time series
-    - **Public Info** (traditional): Public information datasets
-    - **Sales** (traditional): Sales and pricing data
-    - **Scientific** (sequential): Scientific simulation data
-    - **Stock** (collections): Stock market data
-    - **Text** (sequential): Text-based time series
-    - **Video** (sequential): Video-based time series
-    - **Web** (traditional): Web analytics data
-    - **Wikipedia** (collections): Wikipedia usage data
-    """)
-    with gr.Tab("📈 Overall Leaderboard"):
         with gr.Row():
-            domain_filter = gr.Dropdown(
-                choices=get_domains(),
-                value="all",
-                label="Filter by Domain",
-                interactive=True
-            )
-            category_filter = gr.Dropdown(
-                choices=get_categories(),
-                value="all",
-                label="Filter by Category",
-                interactive=True
-            )
-            dataset_filter = gr.Dropdown(
-                choices=get_datasets(),
-                value="all",
-                label="Filter by Dataset",
-                interactive=True
-            )
-        leaderboard_df = gr.Dataframe(
-            value=update_leaderboard_overall(),
-            headers=["Rank", "Model", "Submitter", "Submission Date", "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Task", "Domain", "Category", "Dataset", "Dataset Version"],
-            datatype=["number", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
-            interactive=False,
-            label="MUSED-FM Overall Leaderboard"
-        )
-        refresh_btn = gr.Button("🔄 Refresh Leaderboard", variant="secondary")
-        refresh_btn.click(
-            fn=lambda d, c, ds: update_leaderboard_overall(d, c, ds),
-            inputs=[domain_filter, category_filter, dataset_filter],
-            outputs=leaderboard_df
-        )
-    with gr.Tab("🏢 By Domain"):
-        gr.Markdown("### Performance by Domain")
-        domain_leaderboard = gr.Dataframe(
-            value=update_leaderboard_by_domain(),
-            headers=["Rank", "Model", "Submitter", "Submission Date", "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Task", "Category", "Dataset", "Dataset Version"],
-            datatype=["number", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
-            interactive=False,
-            label="Domain-Specific Leaderboard"
-        )
-        domain_refresh_btn = gr.Button("🔄 Refresh Domain Leaderboard", variant="secondary")
-        domain_refresh_btn.click(fn=update_leaderboard_by_domain, outputs=domain_leaderboard)
-    with gr.Tab("📂 By Category"):
-        gr.Markdown("### Performance by Category")
-        category_leaderboard = gr.Dataframe(
-            value=update_leaderboard_by_category(),
-            headers=["Rank", "Model", "Submitter", "Submission Date", "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Task", "Domain", "Dataset", "Dataset Version"],
-            datatype=["number", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
-            interactive=False,
-            label="Category-Specific Leaderboard"
         )
-        category_refresh_btn = gr.Button("🔄 Refresh Category Leaderboard", variant="secondary")
-        category_refresh_btn.click(fn=update_leaderboard_by_category, outputs=category_leaderboard)
-    with gr.Tab("📊 By Dataset"):
-        gr.Markdown("### Performance by Dataset")
-        dataset_leaderboard = gr.Dataframe(
-            value=update_leaderboard_by_dataset(),
-            headers=["Rank", "Model", "Submitter", "Submission Date", "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Task", "Domain", "Category", "Dataset Version"],
-            datatype=["number", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
-            interactive=False,
-            label="Dataset-Specific Leaderboard"
         )
-        dataset_refresh_btn = gr.Button("🔄 Refresh Dataset Leaderboard", variant="secondary")
-        dataset_refresh_btn.click(fn=update_leaderboard_by_dataset, outputs=dataset_leaderboard)
-    with gr.Tab("📝 Submit Model"):
-        gr.Markdown("### Submit Your Model Results")
-        with gr.Row():
-            with gr.Column():
-                model_name = gr.Textbox(label="Model Name", placeholder="e.g., MyTimeSeriesModel")
-                submitter_name = gr.Textbox(label="Submitter Name", placeholder="Your name or organization")
-                gr.Markdown("### Performance Metrics")
-                mae = gr.Number(label="MAE (Mean Absolute Error)", precision=3)
-                uni_mae = gr.Number(label="Uni-MAE (Univariate MAE)", precision=3)
-                rmse = gr.Number(label="RMSE (Root Mean Square Error)", precision=3)
-                mape = gr.Number(label="MAPE (Mean Absolute Percentage Error)", precision=1)
-                r2 = gr.Number(label="R² (Coefficient of Determination)", precision=3)
-                smape = gr.Number(label="SMAPE (Symmetric MAPE)", precision=1)
-                uni_multi = gr.Number(label="Uni-Multi (Univariate-Multivariate)", precision=3)
-            with gr.Column():
-                task = gr.Dropdown(
-                    choices=["multivariate_forecasting"],
-                    value="multivariate_forecasting",
-                    label="Task"
-                )
-                domain = gr.Dropdown(
-                    choices=["Causal Model", "Dynamic", "Energy", "Engineering", "Environment", "Finance", "Health", "Image", "Public Info", "Sales", "Scientific", "Stock", "Text", "Video", "Web", "Wikipedia"],
-                    value="Energy",
-                    label="Domain"
-                )
-                category = gr.Dropdown(
-                    choices=["synthetic", "traditional", "sequential", "collections"],
-                    value="traditional",
-                    label="Category"
-                )
-                dataset = gr.Textbox(label="Dataset Name", placeholder="e.g., ecl, fred_md1, large_convlag_synin_s")
-                dataset_version = gr.Textbox(label="Dataset Version", value="v1.0")
-                paper_url = gr.Textbox(label="Paper URL (optional)", placeholder="https://arxiv.org/abs/...")
-                code_url = gr.Textbox(label="Code URL (optional)", placeholder="https://github.com/...")
-                submit_btn = gr.Button("🚀 Submit Model", variant="primary")
-                submission_status = gr.Textbox(label="Submission Status", interactive=False)
-        submit_btn.click(
-            fn=submit_model,
-            inputs=[model_name, submitter_name, mae, uni_mae, rmse, mape, r2, smape, uni_multi, task, domain, category, dataset, dataset_version, paper_url, code_url],
-            outputs=submission_status
         )
-    with gr.Tab("📦 Bulk Submit"):
-        gr.Markdown("### Bulk Submit Results for Multiple Domain/Dataset Combinations")
-        gr.Markdown("""
-        **Format**: Submit a JSON array of results. Each entry should contain:
-        ```json
-        [
-          {
-            "domain": "Energy",
-            "category": "traditional",
-            "dataset": "ecl",
-            "dataset_version": "v1.0",
-            "metrics": {
-              "MAE": 10.0,
-              "Uni-MAE": 20.0,
-              "RMSE": 10.0,
-              "MAPE": 10.0,
-              "R²": 10.0,
-              "SMAPE": 10.0,
-              "Uni-Multi": 10.0
-            }
-          }
-        ]
-        ```
-        """)
-        bulk_model_name = gr.Textbox(label="Model Name", placeholder="e.g., MyTimeSeriesModel")
-        bulk_submitter_name = gr.Textbox(label="Submitter Name", placeholder="Your name or organization")
-        bulk_results_data = gr.Textbox(
-            label="Bulk Results Data (JSON)",
-            placeholder="Paste your JSON array here...",
-            lines=10
         )
-        bulk_paper_url = gr.Textbox(label="Paper URL (optional)", placeholder="https://arxiv.org/abs/...")
-        bulk_code_url = gr.Textbox(label="Code URL (optional)", placeholder="https://github.com/...")
-        bulk_submit_btn = gr.Button("📦 Submit Bulk Results", variant="primary")
-        bulk_submission_status = gr.Textbox(label="Bulk Submission Status", interactive=False)
-        bulk_submit_btn.click(
-            fn=submit_bulk_results,
-            inputs=[bulk_model_name, bulk_submitter_name, bulk_results_data, bulk_paper_url, bulk_code_url],
-            outputs=bulk_submission_status
         )
-    with gr.Tab("📋 Dataset Info"):
-        gr.Markdown("""
-        ## MUSED-FM Dataset Information
-        ### Overview
-        MUSED-FM is a comprehensive multivariate time series dataset designed for forecasting tasks. The dataset contains multiple time series with various characteristics and complexities.
-        ### Dataset Characteristics
-        - **Type**: Multivariate Time Series
-        - **Domain**: General forecasting tasks
-        - **Features**: Multiple variables per time series
-        - **Temporal Resolution**: Various (hourly, daily, etc.)
-        ### Evaluation Protocol
-        1. Models are evaluated on held-out test sets
-        2. Standard train/validation/test splits are provided
-        3. Multiple evaluation metrics are used for comprehensive assessment
-        4. Results should be reproducible and include proper citations
-        ### Submission Guidelines
-        - Provide accurate performance metrics
-        - Include links to papers and code when available
-        - Ensure reproducibility of results
-        - Follow ethical AI practices
-        ### Contact
-        For questions about the dataset or leaderboard, please contact the maintainers.
-        """)
-    with gr.Tab("📊 Statistics"):
-        gr.Markdown("### Leaderboard Statistics")
-        def get_stats():
-            results = load_results()
-            if not results:
-                return "No submissions yet."
-            total_models = len(results)
-            avg_mae = np.mean([r["metrics"]["MAE"] for r in results])
-            avg_rmse = np.mean([r["metrics"]["RMSE"] for r in results])
-            avg_r2 = np.mean([r["metrics"]["R²"] for r in results])
-            best_mae = min([r["metrics"]["MAE"] for r in results])
-            best_r2 = max([r["metrics"]["R²"] for r in results])
-            stats_text = f"""
-            **Total Submissions**: {total_models}
-            **Average Performance**:
-            - MAE: {avg_mae:.3f}
-            - RMSE: {avg_rmse:.3f}
-            - R²: {avg_r2:.3f}
-            **Best Performance**:
-            - Best MAE: {best_mae:.3f}
-            - Best R²: {best_r2:.3f}
-            """
-            return stats_text
-        stats_display = gr.Markdown(value=get_stats())
-        refresh_stats_btn = gr.Button("🔄 Refresh Statistics")
-        refresh_stats_btn.click(fn=get_stats, outputs=stats_display)
 if __name__ == "__main__":
-    demo.launch()

+"""
+MUSED-FM Leaderboard - Main Gradio Application
+Following GIFT-Eval import structure with custom layout
+"""
 import gradio as gr
 import pandas as pd
+# Optional imports for production features
+try:
+    from apscheduler.schedulers.background import BackgroundScheduler
+    SCHEDULER_AVAILABLE = True
+except ImportError:
+    SCHEDULER_AVAILABLE = False
+    print("Warning: apscheduler not available, scheduler features disabled")
+try:
+    from huggingface_hub import snapshot_download
+    HUB_AVAILABLE = True
+except ImportError:
+    HUB_AVAILABLE = False
+    print("Warning: huggingface_hub not available, hub features disabled")
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    ModelInfoColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_info_df, get_merged_df
+from src.utils import norm_sNavie, pivot_df, get_grouped_dfs, pivot_existed_df, rename_metrics, format_df
+from src.load_results import (
+    load_results_with_metadata,
+    create_overall_table,
+    get_filter_options,
+    get_model_metadata,
+    create_model_metadata_display,
+    get_overall_summary
+)
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+def create_leaderboard_interface():
+    """Create the main leaderboard interface"""
+    demo = gr.Blocks(css=custom_css)
+    with demo:
+        gr.HTML(TITLE)
+        # Minimizable description section
+        with gr.Accordion("📖 Description", open=False):
+            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+        # Get filter options
+        filter_options = get_filter_options()
+        # Main content area
+        with gr.Row():
+            with gr.Column(scale=1):
+                # Individual minimizable filter sections
+                with gr.Accordion("🔍 Model Search", open=False):
+                    model_search = gr.Textbox(
+                        label="Model Search",
+                        placeholder="Search for a specific model...",
+                        info="Type part of a model name to filter"
+                    )
+                with gr.Accordion("📂 Category Filter", open=False):
+                    category_radio = gr.Radio(
+                        choices=filter_options["categories"],
+                        value="all",
+                        label="Category",
+                        info="Filter by category"
+                    )
+                with gr.Accordion("🌐 Domain Filter", open=False):
+                    domain_radio = gr.Radio(
+                        choices=filter_options["domains"],
+                        value="all",
+                        label="Domain",
+                        info="Filter by domain"
+                    )
+                with gr.Accordion("📊 Dataset Filter", open=False):
+                    dataset_radio = gr.Radio(
+                        choices=filter_options["datasets"],
+                        value="all",
+                        label="Dataset",
+                        info="Filter by dataset"
+                    )
+                clear_filters_btn = gr.Button("🗑️ Clear All Filters", variant="secondary")
+            with gr.Column(scale=3):
+                gr.Markdown("### 📋 Model Rankings")
+                # Main results table
+                results_table = gr.Dataframe(
+                    value=create_overall_table(),
+                    headers=["Rank", "Model", "Organization", "Datasets", "Domains", "Categories",
+                            "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Submission Date"],
+                    datatype=["number", "str", "str", "number", "number", "number",
+                            "str", "str", "str", "str", "str", "str", "str", "str"],
+                    interactive=False,
+                    label="Overall Rankings",
+                    wrap=True,
+                    elem_classes=["elegant-table"]
+                )
+                refresh_btn = gr.Button("🔄 Refresh Table", variant="primary")
+        # Model metadata section at bottom
+        with gr.Accordion("🔍 Model Inspector", open=False):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    model_selector = gr.Dropdown(
+                        choices=filter_options["models"],
+                        value=None,
+                        label="Select Model",
+                        info="Choose a model to view its metadata",
+                        allow_custom_value=False
+                    )
+                with gr.Column(scale=3):
+                    metadata_display = gr.Markdown(
+                        value="Select a model to view its metadata.",
+                        label="Model Metadata"
+                    )
+        # Summary statistics section
         with gr.Row():
+            with gr.Column():
+                gr.Markdown("### 📈 Summary Statistics")
+                summary_text = gr.Markdown(value=get_overall_summary())
+        # About section
+        with gr.Tabs():
+            with gr.Tab("📖 About"):
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        # Citation section
+        with gr.Row():
+            with gr.Accordion("📙 Citation", open=False):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    lines=20,
+                    elem_id="citation-button",
+                    show_copy_button=True,
+                )
+        # Event handlers
+        def update_table(domain, category, dataset, model):
+            return create_overall_table(domain, category, dataset, model)
+        def clear_filters():
+            return "all", "all", "all", ""
+        # Connect filters to table updates
+        domain_radio.change(
+            fn=update_table,
+            inputs=[domain_radio, category_radio, dataset_radio, model_search],
+            outputs=results_table
         )
+        category_radio.change(
+            fn=update_table,
+            inputs=[domain_radio, category_radio, dataset_radio, model_search],
+            outputs=results_table
         )
+        dataset_radio.change(
+            fn=update_table,
+            inputs=[domain_radio, category_radio, dataset_radio, model_search],
+            outputs=results_table
         )
+        model_search.change(
+            fn=update_table,
+            inputs=[domain_radio, category_radio, dataset_radio, model_search],
+            outputs=results_table
         )
+        refresh_btn.click(
+            fn=update_table,
+            inputs=[domain_radio, category_radio, dataset_radio, model_search],
+            outputs=results_table
         )
+        clear_filters_btn.click(
+            fn=clear_filters,
+            outputs=[domain_radio, category_radio, dataset_radio, model_search]
+        )
+        # Model selector event handler
+        model_selector.change(
+            fn=create_model_metadata_display,
+            inputs=[model_selector],
+            outputs=[metadata_display]
+        )
+    return demo
+# Start scheduler if available
+if SCHEDULER_AVAILABLE:
+    scheduler = BackgroundScheduler()
+    scheduler.start()
+else:
+    scheduler = None
+# Launch the demo
 if __name__ == "__main__":
+    demo = create_leaderboard_interface()
+    demo.queue(default_concurrency_limit=40).launch()

demo.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""
+MUSED-FM Leaderboard - Local Demo
+Imports from app.py to ensure identical functionality, loads a local demo leaderboard
+"""
+import gradio as gr
+import pandas as pd
+import json
+import os
+from datetime import datetime
+from typing import Dict, List, Any
+import numpy as np
+# Import from our src package
+from src.load_results import (
+    load_results_with_metadata,
+    create_overall_table,
+    get_filter_options,
+    get_model_metadata,
+    create_model_metadata_display,
+    get_overall_summary
+)
+# Import the main interface function from app.py
+from app import create_leaderboard_interface
+# Create the demo using the same function as app.py
+demo = create_leaderboard_interface()
+# Launch the demo
+if __name__ == "__main__":
+    print("🎨 MUSED-FM Leaderboard Local Demo")
+    print("=" * 50)
+    try:
+        print("📊 Loading data...")
+        results = load_results_with_metadata()
+        print(f"✅ Loaded {len(results)} results")
+        print("🏗️ Creating interface...")
+        print("🚀 Starting local leaderboard...")
+        print("📊 Access at: http://localhost:7860")
+        print("🔄 Press Ctrl+C to stop")
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=False,
+            show_error=True,
+            quiet=False
+        )
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()

requirements.txt CHANGED Viewed

@@ -1,4 +1,8 @@
 gradio==5.49.0
 pandas>=1.5.0
 numpy>=1.21.0
 json5>=0.9.0

 gradio==5.49.0
+gradio-leaderboard
 pandas>=1.5.0
 numpy>=1.21.0
+plotly
+apscheduler
+huggingface-hub
 json5>=0.9.0

requirements_local.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio>=4.0.0
+pandas>=1.5.0
+numpy>=1.21.0

results/sample_submission/metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "model": "EXAMPLE",
+    "submitter": "Research Team",
+    "submission_date": "2025-10-09",
+    "task": "multivariate_forecasting",
+    "dataset_version": "v1.0",
+    "paper_url": "https://example.com/paper1",
+    "code_url": "https://github.com/example/repo1"
+}

sample_bulk_submission.json → results/sample_submission/sample_bulk_submission.json RENAMED Viewed

File without changes

results/sample_submission2/metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "model": "EXAMPLE2",
+    "submitter": "Research Team",
+    "submission_date": "2025-10-09",
+    "task": "multivariate_forecasting",
+    "dataset_version": "v1.0",
+    "paper_url": "https://example.com/paper2",
+    "code_url": "https://github.com/example/repo2"
+}

results/sample_submission2/results.json ADDED Viewed

	@@ -0,0 +1,1292 @@

+[
+    {
+      "domain": "Causal Model",
+      "category": "synthetic",
+      "dataset": "large_convlag_synin_s",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Causal Model",
+      "category": "synthetic",
+      "dataset": "medium_convlag_synin_s",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Causal Model",
+      "category": "synthetic",
+      "dataset": "medium_obslag_synin_s",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Causal Model",
+      "category": "synthetic",
+      "dataset": "tiny_convlag_synin_ns",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Causal Model",
+      "category": "synthetic",
+      "dataset": "tiny_obslag_synin_ns",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Dynamic",
+      "category": "synthetic",
+      "dataset": "dynamic_data_csvs",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "al_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "aus_electricity_nsw",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "aus_electricity_qld",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "az_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "az_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "cal_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "cal_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "car_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "car_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "central_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "co_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "eastern_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "ecl",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "ercot_load",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "fl_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "id_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "mds_microgrid",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "ne_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "ne_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "nm_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "northern_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "ny_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "ny_electricity2525",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "or_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "pa_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "pa_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "se_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "solar_alabama",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "southern_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "tn_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "tn_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "tx_daily",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "tx_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Energy",
+      "category": "traditional",
+      "dataset": "western_electricity",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Engineering",
+      "category": "traditional",
+      "dataset": "ev-sensors",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Engineering",
+      "category": "traditional",
+      "dataset": "voip",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Environment",
+      "category": "traditional",
+      "dataset": "beijing_aq",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Environment",
+      "category": "traditional",
+      "dataset": "beijing_embassy",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Environment",
+      "category": "traditional",
+      "dataset": "causalrivers",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Environment",
+      "category": "traditional",
+      "dataset": "gas_sensor",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Environment",
+      "category": "traditional",
+      "dataset": "oikolab_weather",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Environment",
+      "category": "traditional",
+      "dataset": "open_aq",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Environment",
+      "category": "traditional",
+      "dataset": "weather_mpi",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Finance",
+      "category": "traditional",
+      "dataset": "fred_md1",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Finance",
+      "category": "traditional",
+      "dataset": "fred_md2",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Finance",
+      "category": "traditional",
+      "dataset": "fred_md3",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Finance",
+      "category": "traditional",
+      "dataset": "fred_md4",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Finance",
+      "category": "traditional",
+      "dataset": "fred_md5",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Finance",
+      "category": "traditional",
+      "dataset": "fred_md6",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Finance",
+      "category": "traditional",
+      "dataset": "fred_md7",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Finance",
+      "category": "traditional",
+      "dataset": "fred_md8",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Health",
+      "category": "traditional",
+      "dataset": "cgm",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Health",
+      "category": "traditional",
+      "dataset": "sleep_lab",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Image",
+      "category": "sequential",
+      "dataset": "cifar150_timeseries_csvs",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "austin_water",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "blue_bikes",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "cursor-tabs",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "mn_interstate",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "mta_ridership",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "paris_mobility",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "lyft",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "uber",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "tac",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Public Info",
+      "category": "traditional",
+      "dataset": "traffic_PeMS",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Sales",
+      "category": "traditional",
+      "dataset": "bitcoin_price",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Sales",
+      "category": "traditional",
+      "dataset": "blow_molding",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Sales",
+      "category": "traditional",
+      "dataset": "gold_prices",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Sales",
+      "category": "traditional",
+      "dataset": "pasta_sales",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Sales",
+      "category": "traditional",
+      "dataset": "rice_prices",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Sales",
+      "category": "traditional",
+      "dataset": "walmart-sales",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Scientific",
+      "category": "sequential",
+      "dataset": "ant_csv_out",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Scientific",
+      "category": "sequential",
+      "dataset": "hopper_csv_out",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Scientific",
+      "category": "sequential",
+      "dataset": "cheetah_csv_out",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Scientific",
+      "category": "sequential",
+      "dataset": "walker2d_csv_out",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Scientific",
+      "category": "sequential",
+      "dataset": "spriteworld",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Stock",
+      "category": "collections",
+      "dataset": "stock_nasdaqtrader",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Text",
+      "category": "sequential",
+      "dataset": "openwebtext_timeseries_csvs",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Video",
+      "category": "sequential",
+      "dataset": "KITTI",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Web",
+      "category": "traditional",
+      "dataset": "website_visitors",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    },
+    {
+      "domain": "Wikipedia",
+      "category": "collections",
+      "dataset": "wikipedia",
+      "dataset_version": "v1.0",
+      "metrics": {
+        "MAE": 15.0,
+        "Uni-MAE": 25.0,
+        "RMSE": 15.0,
+        "MAPE": 15.0,
+        "R\u00b2": 15.0,
+        "SMAPE": 15.0,
+        "Uni-Multi": 15.0
+      }
+    }
+  ]

src/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+"""
+MUSED-FM Leaderboard source package
+"""
+from .load_results import (
+    load_results_with_metadata,
+    create_overall_table,
+    get_filter_options,
+    get_model_metadata,
+    create_model_metadata_display,
+    get_overall_summary
+)
+__all__ = [  # public API of the package: the same six names imported from .load_results above
+    "load_results_with_metadata",
+    "create_overall_table",
+    "get_filter_options",
+    "get_model_metadata",
+    "create_model_metadata_display",
+    "get_overall_summary"
+]

src/about.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""
+Text constants for MUSED-FM Leaderboard
+"""
+TITLE = """
+<div style="text-align: center;">
+    <h1>📊 MUSED-FM Leaderboard</h1>
+    <p style="font-size: 18px; color: #666;">Multivariate Time Series Evaluation Dataset for Foundation Models</p>
+</div>
+"""  # HTML snippet: centered page heading plus a grey subtitle line
+INTRODUCTION_TEXT = """
+Welcome to the **MUSED-FM Leaderboard**! This leaderboard provides comprehensive evaluation results for foundation models on multivariate time series forecasting tasks.
+**MUSED-FM** spans 16 multivariate time series domains and introduces novel synthetic data techniques, comprising 67 billion data points and 2.6 million time series.
+### Key Features:
+- **Scale**: 67 billion data points across 2.6 million time series
+- **Domains**: 16 multivariate time series domains
+- **Innovation**: Novel synthetic data techniques
+- **Evaluation**: Comprehensive metrics including MAE, RMSE, MAPE, R², SMAPE, Uni-MAE, and Uni-Multi
+### Dataset Structure:
+- **Categories**: Traditional, Sequential, Synthetic, Collections
+- **Domains**: Finance, Health, Energy, Environment, Engineering, and more
+- **Datasets**: 86+ individual time series datasets
+Use the filters below to explore results by different criteria and compare model performance across various domains and categories.
+"""  # Markdown intro text (bold and ### headings); presumably rendered above the leaderboard filters - confirm in app.py
+# Markdown "About" text: dataset overview, metric definitions, resource links, BibTeX
+# citation, contact pointers, and a description of what the leaderboard offers.
+# NOTE(review): the "Key Features" and "Dataset Structure" sections repeat the same
+# bullets as INTRODUCTION_TEXT; both copies must be edited when the figures change.
+LLM_BENCHMARKS_TEXT = """
+# About MUSED-FM Leaderboard
+## Dataset Overview
+**MUSED-FM** (Multivariate Time Series Evaluation Dataset for Foundation Models) is a comprehensive benchmark for evaluating foundation models on multivariate time series forecasting tasks.
+### Key Features:
+- **Scale**: 67 billion data points across 2.6 million time series
+- **Domains**: 16 multivariate time series domains
+- **Innovation**: Novel synthetic data techniques
+- **Evaluation**: Comprehensive metrics including MAE, RMSE, MAPE, R², SMAPE, Uni-MAE, and Uni-Multi
+### Dataset Structure:
+- **Categories**: Traditional, Sequential, Synthetic, Collections
+- **Domains**: Finance, Health, Energy, Environment, Engineering, and more
+- **Datasets**: 86+ individual time series datasets
+## Evaluation Metrics
+### Standard Metrics:
+- **MAE** (Mean Absolute Error): Average absolute difference between predicted and actual values
+- **RMSE** (Root Mean Square Error): Square root of average squared differences
+- **MAPE** (Mean Absolute Percentage Error): Average percentage error
+- **R²** (Coefficient of Determination): Proportion of variance explained
+- **SMAPE** (Symmetric Mean Absolute Percentage Error): Symmetric percentage error
+### Novel Metrics:
+- **Uni-MAE**: Unified MAE metric for cross-dataset comparison
+- **Uni-Multi**: Unified multivariate metric for comprehensive evaluation
+## Resources
+### Dataset Access:
+- **Hugging Face**: [MUSED-FM Dataset](https://huggingface.co/datasets/Synthefy/MUSED-FM)
+- **GitHub Repository**: [MUSED-FM Code](https://github.com/Synthefy/MUSED-FM)
+### Citation:
+If you use MUSED-FM in your research, please cite the original paper:
+```bibtex
+@article{mused-fm2024,
+    title={MUSED-FM: A Multivariate Time Series Evaluation Dataset for Foundation Models},
+    author={Synthefy Research Team},
+    journal={arXiv preprint},
+    year={2024}
+}
+```
+## Contact & Support
+For questions about the dataset or leaderboard:
+- **Issues**: Report issues on the [GitHub repository](https://github.com/Synthefy/MUSED-FM)
+- **Discussions**: Join discussions on [Hugging Face](https://huggingface.co/datasets/Synthefy/MUSED-FM)
+## Leaderboard Information
+This leaderboard provides:
+- **Real-time Rankings**: Live updates as new submissions are received
+- **Filtered Views**: Explore results by domain, category, and dataset
+- **Model Inspector**: Detailed metadata for each submitted model
+- **Comprehensive Metrics**: Multiple evaluation perspectives
+The leaderboard aggregates results across all datasets to provide overall model rankings while maintaining the ability to drill down into specific domains and categories.
+"""
+# Label and raw BibTeX entry for the citation widget. The entry text is the same
+# @article record that appears inside the bibtex code fence of LLM_BENCHMARKS_TEXT.
+CITATION_BUTTON_LABEL = "📋 Citation"
+CITATION_BUTTON_TEXT = """@article{mused-fm2024,
+    title={MUSED-FM: A Multivariate Time Series Evaluation Dataset for Foundation Models},
+    author={Synthefy Research Team},
+    journal={arXiv preprint},
+    year={2024}
+}"""
+# Markdown heading plus a one-sentence description for the evaluation-queue section.
+EVALUATION_QUEUE_TEXT = """
+## Evaluation Queue
+This section shows the current status of model evaluations in the queue.
+"""

src/display/css_html_js.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""
+CSS and styling for MUSED-FM Leaderboard
+"""
+# Stylesheet text with four rules: fonts for .elegant-table, size/line-height for
+# .markdown-text, top margin for .tab-buttons, and a monospace face for #citation-button.
+custom_css = """
+/* Custom styling for MUSED-FM Leaderboard */
+.elegant-table {
+    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+}
+.markdown-text {
+    font-size: 14px;
+    line-height: 1.6;
+}
+.tab-buttons {
+    margin-top: 20px;
+}
+#citation-button {
+    font-family: 'Courier New', monospace;
+    font-size: 12px;
+}
+"""

src/display/utils.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""
+Display utilities and column definitions for MUSED-FM Leaderboard
+"""
+from dataclasses import dataclass
+from typing import List, Dict, Any
+from enum import Enum
+# Column definitions for model information
+@dataclass
+class ModelInfoColumn:
+    """Describe one column of the model-information table.
+    Only ``name`` is required; every other field has a default."""
+    name: str  # column key, e.g. "model" or "paper_url"
+    type: str = "str"  # value-type tag; the entries in this module use "str" and "number"
+    # The three flags below are not read anywhere in this module;
+    # presumably the UI layer consumes them - TODO confirm against the caller.
+    displayed_by_default: bool = True
+    never_hidden: bool = False
+    hidden: bool = False
+# Model information columns
+# Positional arguments follow the dataclass field order:
+# (name, type, displayed_by_default, never_hidden, hidden).
+# Only "model" sets never_hidden; "paper_url" and "code_url" are the two
+# entries that are not displayed by default.
+model_info_columns = [
+    ModelInfoColumn("model", "str", True, True, False),
+    ModelInfoColumn("organization", "str", True, False, False),
+    ModelInfoColumn("submission_date", "str", True, False, False),
+    ModelInfoColumn("task", "str", True, False, False),
+    ModelInfoColumn("dataset_version", "str", True, False, False),
+    ModelInfoColumn("paper_url", "str", False, False, False),
+    ModelInfoColumn("code_url", "str", False, False, False),
+    ModelInfoColumn("domains", "number", True, False, False),
+    ModelInfoColumn("categories", "number", True, False, False),
+    ModelInfoColumn("datasets", "number", True, False, False),
+]
+# Benchmark columns (metrics)
+# Names of the seven metric columns, spelled exactly as the metric keys in the results JSON.
+BENCHMARK_COLS = [
+    "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi"
+]
+# Evaluation columns
+# Per-result descriptive keys. NOTE(review): this list says "submitter" while
+# model_info_columns uses "organization" for what looks like the same value - confirm.
+EVAL_COLS = [
+    "model", "submitter", "submission_date", "domain", "category", "dataset",
+    "task", "dataset_version", "paper_url", "code_url"
+]
+# Evaluation types
+# Single supported task identifier.
+EVAL_TYPES = ["multivariate_forecasting"]
+# Model types
+# Display labels for model families; not referenced elsewhere in this module.
+class ModelType(Enum):
+    FOUNDATION = "Foundation Model"
+    TRADITIONAL = "Traditional"
+    NEURAL = "Neural Network"
+    TRANSFORMER = "Transformer"
+# Weight types
+# Display labels for three size classes; not referenced elsewhere in this module.
+class WeightType(Enum):
+    LIGHTWEIGHT = "Lightweight"
+    MEDIUM = "Medium"
+    HEAVY = "Heavy"
+# Precision types
+# Display labels for numeric precision; not referenced elsewhere in this module.
+class Precision(Enum):
+    FLOAT16 = "FP16"
+    FLOAT32 = "FP32"
+    MIXED = "Mixed"
+# Fields function for dataclass
+def fields(cls):
+    """Return the dataclass field descriptors of ``cls``, or ``[]`` when it has none."""
+    try:
+        registry = cls.__dataclass_fields__
+    except AttributeError:
+        # Not a dataclass (or dataclass instance): nothing to enumerate.
+        return []
+    return registry.values()

src/envs.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""
+Environment configuration for MUSED-FM Leaderboard
+"""
+import os
+# API configuration
+class API:
+    """Stand-in client: ``restart_space`` only prints a message, it performs no restart."""
+    @staticmethod
+    def restart_space(repo_id: str):
+        """Print a ``Restarting space: <repo_id>`` line to stdout."""
+        message = "Restarting space: {}".format(repo_id)
+        print(message)
+# Repository configuration
+# Plain string identifiers; nothing in this module resolves them against a hub.
+REPO_ID = "mused-fm-leaderboard"
+QUEUE_REPO = "mused-fm-queue"
+RESULTS_REPO = "mused-fm-results"
+# Paths
+# Relative directory names (resolved against the process working directory when used).
+EVAL_REQUESTS_PATH = "eval_requests"
+EVAL_RESULTS_PATH = "results"
+# Token (placeholder)
+# Read once at import time from the HF_TOKEN environment variable; empty string when unset.
+TOKEN = os.getenv("HF_TOKEN", "")

src/load_results.py ADDED Viewed

	@@ -0,0 +1,285 @@

+"""
+Data loading utilities for MUSED-FM Leaderboard
+"""
+import json
+import os
+import pandas as pd
+import numpy as np
+from typing import Dict, List, Any
+def load_results_with_metadata() -> List[Dict]:
+    """Load evaluation results relative to the current working directory.
+
+    Lookup order:
+      1. ``results.json``: when it exists, its ``"results"`` list is returned
+         unchanged and the ``results/`` directory is not consulted.
+      2. Otherwise every sub-folder of ``results/`` that holds a ``metadata.json``
+         plus at least one other ``*.json`` file is read. Each result entry gets
+         ``model``, ``submitter``, ``submission_date``, ``task``,
+         ``dataset_version``, ``paper_url`` and ``code_url`` overridden from the
+         metadata (falling back to the entry's own value, then to ``""``).
+
+    Folders and candidate files are visited in sorted order so the output does
+    not depend on filesystem enumeration order. A results file may be a bare
+    list or a ``{"results": [...]}`` object (the layout used by the top-level
+    ``results.json``). A submission that fails to load is reported on stdout
+    and skipped as a whole.
+
+    Returns:
+        List of result dicts; empty when nothing could be loaded.
+    """
+    # First, try to load from results.json (user submissions)
+    results_file = "results.json"
+    if os.path.exists(results_file):
+        # JSON is UTF-8 by specification; do not rely on the locale default.
+        with open(results_file, 'r', encoding="utf-8") as f:
+            data = json.load(f)
+        return data.get("results", [])
+    # Load from results directory with metadata support
+    results_dir = "results"
+    if not os.path.isdir(results_dir):
+        return []
+    metadata_keys = (
+        "model", "submitter", "submission_date", "task",
+        "dataset_version", "paper_url", "code_url",
+    )
+    all_results = []
+    for item in sorted(os.listdir(results_dir)):
+        item_path = os.path.join(results_dir, item)
+        if not os.path.isdir(item_path):
+            continue
+        metadata_path = os.path.join(item_path, "metadata.json")
+        # The results file may have any name (results.json, sample_bulk_submission.json, ...);
+        # take the alphabetically first candidate so the choice is reproducible.
+        candidates = sorted(
+            name for name in os.listdir(item_path)
+            if name.endswith('.json') and name != 'metadata.json'
+        )
+        if not candidates or not os.path.exists(metadata_path):
+            continue
+        results_path = os.path.join(item_path, candidates[0])
+        try:
+            with open(metadata_path, 'r', encoding="utf-8") as f:
+                metadata = json.load(f)
+            with open(results_path, 'r', encoding="utf-8") as f:
+                results_data = json.load(f)
+            if isinstance(results_data, dict):
+                # Same wrapper layout as the top-level results.json.
+                results_data = results_data.get("results", [])
+            submission = []
+            for result in results_data:
+                # Override with metadata information
+                for key in metadata_keys:
+                    result[key] = metadata.get(key, result.get(key, ""))
+                submission.append(result)
+        except (OSError, ValueError, TypeError, AttributeError) as e:
+            # Best effort: one broken submission must not hide the others.
+            print(f"Error loading {item_path}: {e}")
+            continue
+        # Extend only after the whole file parsed, so a failure never leaves half a submission.
+        all_results.extend(submission)
+    return all_results
+def create_overall_table(domain_filter="all", category_filter="all", dataset_filter="all", model_filter=""):
+    """Build the per-model leaderboard table, optionally filtered.
+
+    Args:
+        domain_filter: exact ``domain`` value to keep, or ``"all"``.
+        category_filter: exact ``category`` value to keep, or ``"all"``.
+        dataset_filter: exact ``dataset`` value to keep, or ``"all"``.
+        model_filter: case-insensitive substring of the model name; empty keeps all.
+
+    Returns:
+        DataFrame with one row per model: ``Model``, ``Organization``, distinct
+        ``Datasets``/``Domains``/``Categories`` counts, the arithmetic mean of each
+        metric formatted as text (``MAPE``/``SMAPE`` with one decimal and ``%``,
+        the rest with three decimals), ``Submission Date`` and ``Rank`` (1 = lowest
+        mean MAE). An empty DataFrame when nothing is loaded or nothing matches.
+
+    Raises:
+        KeyError: when a result lacks ``model``, ``metrics`` or one of the
+            required metrics (MAE, RMSE, MAPE, R², SMAPE).
+    """
+    results = load_results_with_metadata()
+    if not results:
+        return pd.DataFrame()
+    # Apply filters
+    exact_filters = (
+        ("domain", domain_filter),
+        ("category", category_filter),
+        ("dataset", dataset_filter),
+    )
+    needle = model_filter.lower() if model_filter else ""
+    filtered_results = [
+        result for result in results
+        if all(wanted == "all" or result.get(field, "") == wanted for field, wanted in exact_filters)
+        and (not needle or needle in result.get("model", "").lower())
+    ]
+    if not filtered_results:
+        return pd.DataFrame()
+    # (metric key, optional?, display format). Optional metrics count as 0 when absent.
+    metric_specs = (
+        ("MAE", False, "{:.3f}"),
+        ("Uni-MAE", True, "{:.3f}"),
+        ("RMSE", False, "{:.3f}"),
+        ("MAPE", False, "{:.1f}%"),
+        ("R²", False, "{:.3f}"),
+        ("SMAPE", False, "{:.1f}%"),
+        ("Uni-Multi", True, "{:.3f}"),
+    )
+    # Group by model and collect metric values
+    model_stats = {}
+    for result in filtered_results:
+        model = result["model"]
+        stats = model_stats.get(model)
+        if stats is None:
+            stats = model_stats[model] = {
+                # .get keeps entries from a hand-written results.json (no metadata merge) loadable.
+                "submitter": result.get("submitter", ""),
+                "submission_date": result.get("submission_date", ""),
+                "values": {name: [] for name, _, _ in metric_specs},
+                "datasets": set(),
+                "domains": set(),
+                "categories": set(),
+            }
+        metrics = result["metrics"]
+        for name, optional, _ in metric_specs:
+            stats["values"][name].append(metrics.get(name, 0) if optional else metrics[name])
+        stats["datasets"].add(result.get("dataset", ""))
+        stats["domains"].add(result.get("domain", ""))
+        stats["categories"].add(result.get("category", ""))
+    # Create aggregated table
+    scored_rows = []
+    for model, stats in model_stats.items():
+        means = {name: np.mean(values) for name, values in stats["values"].items()}
+        row = {
+            "Model": model,
+            "Organization": stats["submitter"],
+            "Datasets": len(stats["datasets"]),
+            "Domains": len(stats["domains"]),
+            "Categories": len(stats["categories"]),
+        }
+        for name, _, template in metric_specs:
+            row[name] = template.format(means[name])
+        row["Submission Date"] = stats["submission_date"]
+        scored_rows.append((means["MAE"], row))
+    # Rank on the unrounded mean: sorting the 3-decimal display string would tie
+    # models whose MAE differs only beyond the third decimal.
+    scored_rows.sort(key=lambda pair: pair[0])
+    table_data = []
+    for position, (_, row) in enumerate(scored_rows, start=1):
+        row["Rank"] = position
+        table_data.append(row)
+    return pd.DataFrame(table_data)
+def get_filter_options():
+    """Return the selectable filter values found in the loaded results.
+
+    The ``domains``, ``categories`` and ``datasets`` lists start with ``"all"``
+    followed by the sorted distinct non-empty values; ``models`` holds the sorted
+    distinct model names only. All four lists are empty when no results load.
+    """
+    loaded = load_results_with_metadata()
+    if not loaded:
+        return {"domains": [], "categories": [], "datasets": [], "models": []}
+    def distinct(field):
+        # Sorted unique values of one key, skipping missing/empty ones.
+        return sorted({entry.get(field, "") for entry in loaded if entry.get(field, "")})
+    options = {}
+    options["domains"] = ["all"] + distinct("domain")
+    options["categories"] = ["all"] + distinct("category")
+    options["datasets"] = ["all"] + distinct("dataset")
+    options["models"] = distinct("model")
+    return options
+def get_model_metadata(model_name):
+    """Collect descriptive metadata for one model.
+
+    Descriptive fields come from the first loaded result whose ``model`` equals
+    ``model_name``; ``domains``, ``categories`` and ``datasets`` are the sorted
+    distinct non-empty values over all of that model's results, and
+    ``total_evaluations`` is how many results it has. Returns ``None`` when no
+    results load or none belongs to the model.
+    """
+    loaded = load_results_with_metadata()
+    if not loaded:
+        return None
+    own_results = [entry for entry in loaded if entry.get("model", "") == model_name]
+    if not own_results:
+        return None
+    first = own_results[0]
+    def distinct(field):
+        return sorted({entry.get(field, "") for entry in own_results if entry.get(field, "")})
+    summary = {
+        key: first.get(key, "")
+        for key in ("model", "submitter", "submission_date", "task",
+                    "dataset_version", "paper_url", "code_url")
+    }
+    summary["domains"] = distinct("domain")
+    summary["categories"] = distinct("category")
+    summary["datasets"] = distinct("dataset")
+    summary["total_evaluations"] = len(own_results)
+    return summary
+def create_model_metadata_display(selected_model):
+    """Create a markdown display for model metadata
+
+    Returns a fixed prompt string when ``selected_model`` is falsy, an error
+    line when ``get_model_metadata`` finds nothing, and otherwise a markdown
+    block with organization, dates, links and evaluation coverage.
+    """
+    if not selected_model:
+        return "Select a model to view its metadata."
+    metadata = get_model_metadata(selected_model)
+    if not metadata:
+        return f"❌ No metadata found for model: {selected_model}"
+    # Create clickable links
+    # An empty URL yields a plain "Not provided" label instead of a markdown link.
+    paper_link = f"[📄 Paper]({metadata['paper_url']})" if metadata['paper_url'] else "📄 Paper: Not provided"
+    code_link = f"[💻 Code]({metadata['code_url']})" if metadata['code_url'] else "💻 Code: Not provided"
+    # The template lines are indented inside the literal, so that leading
+    # whitespace is part of the returned string.
+    # Only the first five dataset names are listed; "..." marks a longer list
+    # and the total count follows in parentheses.
+    metadata_text = f"""
+    ## 🔍 Model Metadata: {metadata['model']}
+    **Organization:** {metadata['submitter']}
+    **Submission Date:** {metadata['submission_date']}
+    **Task:** {metadata['task']}
+    **Dataset Version:** {metadata['dataset_version']}
+    **Links:**
+    {paper_link}
+    {code_link}
+    **Evaluation Coverage:**
+    - **Total Evaluations:** {metadata['total_evaluations']}
+    - **Domains:** {', '.join(metadata['domains']) if metadata['domains'] else 'None'}
+    - **Categories:** {', '.join(metadata['categories']) if metadata['categories'] else 'None'}
+    - **Datasets:** {', '.join(metadata['datasets'][:5])}{'...' if len(metadata['datasets']) > 5 else ''} ({len(metadata['datasets'])} total)
+    """
+    return metadata_text
+def get_overall_summary():
+    """Generate summary statistics for the overall view
+
+    Built from the unfiltered ``create_overall_table()`` output; returns
+    ``"No data available."`` when that table is empty.
+    """
+    overall_df = create_overall_table()
+    if overall_df.empty:
+        return "No data available."
+    total_models = len(overall_df)
+    # These are sums of the per-model distinct counts, so a dataset/domain/category
+    # evaluated by several models is counted once per model.
+    total_datasets = overall_df['Datasets'].sum()
+    total_domains = overall_df['Domains'].sum()
+    total_categories = overall_df['Categories'].sum()
+    # Calculate average metrics
+    # The table stores MAE and R² as formatted strings, hence the float() round-trip.
+    mae_values = [float(x) for x in overall_df['MAE']]
+    r2_values = [float(x) for x in overall_df['R²']]
+    avg_mae = np.mean(mae_values)
+    best_mae = min(mae_values)  # lower MAE is treated as better
+    avg_r2 = np.mean(r2_values)
+    best_r2 = max(r2_values)  # higher R² is treated as better
+    # The template lines are indented inside the literal, so that leading
+    # whitespace is part of the returned string.
+    stats_text = f"""
+    **Overall Summary:**
+    - Total Models: {total_models}
+    - Total Dataset Evaluations: {total_datasets}
+    - Total Domain Evaluations: {total_domains}
+    - Total Category Evaluations: {total_categories}
+    **Performance Metrics:**
+    - Average MAE: {avg_mae:.3f}
+    - Best MAE: {best_mae:.3f}
+    - Average R²: {avg_r2:.3f}
+    - Best R²: {best_r2:.3f}
+    """
+    return stats_text

src/populate.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""
+Data population functions for MUSED-FM Leaderboard
+"""
+import pandas as pd
+from typing import Dict, List, Any, Optional
+from .load_results import load_results_with_metadata, create_overall_table
+def get_leaderboard_df(results_path: str, requests_path: str, eval_cols: List[str], benchmark_cols: List[str]) -> pd.DataFrame:
+    """Return the unfiltered leaderboard table.
+
+    All four parameters are accepted for interface compatibility only; none of
+    them is read. The table comes straight from ``create_overall_table()``,
+    which already returns an empty DataFrame when no results load, so the
+    results are read from disk once instead of twice.
+    """
+    return create_overall_table()
+def get_model_info_df(results_path: str, requests_path: str) -> pd.DataFrame:
+    """Return one descriptive row per distinct model.
+
+    Neither parameter is read. The first loaded result of each model supplies
+    the row; ``model_type`` and ``testdata_leakage`` are fixed defaults. An
+    empty DataFrame is returned when no results load.
+    """
+    loaded = load_results_with_metadata()
+    if not loaded:
+        return pd.DataFrame()
+    # Extract unique model information (first occurrence wins)
+    rows_by_model = {}
+    for entry in loaded:
+        name = entry["model"]
+        if name in rows_by_model:
+            continue
+        row = {"model": name}
+        row["organization"] = entry["submitter"]
+        row["submission_date"] = entry["submission_date"]
+        for optional_key in ("task", "dataset_version", "paper_url", "code_url"):
+            row[optional_key] = entry.get(optional_key, "")
+        row["model_type"] = "Foundation Model"  # Default
+        row["testdata_leakage"] = "No"  # Default
+        rows_by_model[name] = row
+    return pd.DataFrame(list(rows_by_model.values()))
+def get_merged_df(leaderboard_df: pd.DataFrame, model_info_df: pd.DataFrame) -> pd.DataFrame:
+    """Left-join model information onto the leaderboard and (re)compute ``Rank``.
+
+    Args:
+        leaderboard_df: leaderboard rows keyed by ``model`` or, as produced by
+            ``create_overall_table``, by ``Model``.
+        model_info_df: one row per model, keyed by ``model``.
+
+    Returns:
+        ``leaderboard_df`` itself when either input is empty; otherwise the
+        merged frame. When an ``MAE`` column is present, ``Rank`` is the
+        min-method rank of its numeric value (1 = lowest) and is moved to the
+        first column.
+
+    Raises:
+        KeyError: when ``leaderboard_df`` has neither a ``model`` nor a
+            ``Model`` column.
+    """
+    if leaderboard_df.empty or model_info_df.empty:
+        return leaderboard_df
+    # Merge on model name
+    if "model" in leaderboard_df.columns:
+        merged = pd.merge(leaderboard_df, model_info_df, on="model", how="left")
+    else:
+        # The overall table names its key column "Model"; join across the two
+        # spellings and drop the duplicated right-hand key afterwards.
+        merged = pd.merge(leaderboard_df, model_info_df, left_on="Model", right_on="model", how="left")
+        merged = merged.drop(columns=["model"])
+    # Add rank column
+    if 'MAE' in merged.columns:
+        # MAE may arrive as formatted text ("10.000"); rank on the number so that
+        # "10.000" does not sort before "9.500". Unparseable cells rank last.
+        mae_numeric = pd.to_numeric(merged['MAE'], errors='coerce')
+        merged['Rank'] = mae_numeric.rank(method='min', na_option='bottom').astype(int)
+        # Move Rank to front
+        cols = ['Rank'] + [col for col in merged.columns if col != 'Rank']
+        merged = merged[cols]
+    return merged
+def get_evaluation_queue_df(requests_path: str, eval_cols: List[str]) -> tuple:
+    """Return three empty DataFrames as a tuple; both arguments are ignored."""
+    # Return empty dataframes for now
+    first = pd.DataFrame()
+    second = pd.DataFrame()
+    third = pd.DataFrame()
+    return first, second, third

src/utils.py ADDED Viewed

	@@ -0,0 +1,91 @@

+"""
+Utility functions for MUSED-FM Leaderboard
+"""
+import pandas as pd
+import numpy as np
+from typing import Dict, List, Any, Optional
+def norm_sNavie(df: pd.DataFrame) -> pd.DataFrame:
+    """Return ``df`` itself, unchanged (no normalization is applied yet)."""
+    # Simple normalization - keep as is for now
+    unchanged = df
+    return unchanged
+def pivot_df(file_path: str, tab_name: str) -> pd.DataFrame:
+    """Read ``file_path`` as CSV and return it as-is; ``tab_name`` is unused.
+
+    Any failure is reported on stdout and answered with an empty DataFrame.
+    """
+    try:
+        loaded = pd.read_csv(file_path)
+    except Exception as e:
+        print(f"Error reading {file_path}: {e}")
+        return pd.DataFrame()
+    else:
+        # Simple pivot - return as is for now
+        return loaded
+def get_grouped_dfs() -> Dict[str, pd.DataFrame]:
+    """Return one DataFrame per view name.
+
+    Keys are ``domain``, ``frequency``, ``term_length``, ``univariate`` and
+    ``overall``. Every value is an independent copy of the overall table, or an
+    empty DataFrame when no results load.
+    """
+    from .load_results import load_results_with_metadata, create_overall_table
+    view_names = ('domain', 'frequency', 'term_length', 'univariate', 'overall')
+    # Load results
+    if not load_results_with_metadata():
+        return {view: pd.DataFrame() for view in view_names}
+    # Create overall dataframe
+    overall_df = create_overall_table()
+    # For now, return the same dataframe for all views
+    # In a real implementation, these would be different aggregations
+    return {view: overall_df.copy() for view in view_names}
+def pivot_existed_df(df: pd.DataFrame, tab_name: str) -> pd.DataFrame:
+    """Return a copy of ``df`` with a ``tab`` column set to ``tab_name``.
+
+    An empty ``df`` is returned as the same object, without the extra column.
+    """
+    if not df.empty:
+        # Add tab name as a column for identification
+        tagged = df.copy()
+        tagged['tab'] = tab_name
+        return tagged
+    return df
+def rename_metrics(df: pd.DataFrame) -> pd.DataFrame:
+    """Add a ``MASE_Rank`` column holding the min-method rank of ``MAE``.
+
+    Despite the name, no column is renamed. ``df`` is returned as the same
+    object when it is empty or has no ``MAE`` column; otherwise a copy with the
+    extra column is returned.
+    """
+    if df.empty or 'MAE' not in df.columns:
+        return df
+    # Add rank column based on MAE
+    ranked = df.copy()
+    ranked['MASE_Rank'] = ranked['MAE'].rank(method='min')
+    return ranked
+def _format_metric_cell(value, decimals: int, suffix: str) -> str:
+    """Render one metric cell: NaN/None becomes "", text passes through, numbers are fixed-point."""
+    if pd.isna(value):
+        return ""
+    if isinstance(value, str):
+        # Already display-formatted (e.g. "12.3%"); a float format spec on a str raises ValueError.
+        return value
+    return f"{value:.{decimals}f}{suffix}"
+def format_df(df: pd.DataFrame) -> pd.DataFrame:
+    """Format the metric columns of ``df`` for display.
+
+    ``MAPE`` and ``SMAPE`` are rendered with one decimal and a ``%`` suffix;
+    ``MAE``, ``Uni-MAE``, ``RMSE``, ``R²`` and ``Uni-Multi`` with three decimals.
+    Missing values become ``""`` and cells that already hold text are kept
+    unchanged, so a table that was formatted earlier can be passed again. Other
+    columns are untouched.
+
+    Returns:
+        ``df`` itself when it is empty, otherwise a formatted copy.
+    """
+    if df.empty:
+        return df
+    df_copy = df.copy()
+    # Format numeric columns
+    numeric_cols = ['MAE', 'Uni-MAE', 'RMSE', 'MAPE', 'R²', 'SMAPE', 'Uni-Multi']
+    percent_cols = ('MAPE', 'SMAPE')
+    for col in numeric_cols:
+        if col not in df_copy.columns:
+            continue
+        decimals, suffix = (1, "%") if col in percent_cols else (3, "")
+        # Bind the per-column settings as defaults so the lambda does not see later loop values.
+        df_copy[col] = df_copy[col].apply(
+            lambda x, d=decimals, s=suffix: _format_metric_cell(x, d, s)
+        )
+    return df_copy