lataon committed on
Commit f3ebaf3 · 1 Parent(s): cbe4946

use simple leaderboard
.gitignore CHANGED
@@ -7,7 +7,7 @@ __pycache__/
 .vscode/
 
 eval-queue/
-eval-results/
+# eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
app.py CHANGED
@@ -1,239 +1,131 @@
-import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 import os
 
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    COLS,
-    AutoEvalColumn,
-    fields,
-)
-from src.about import Tasks
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation (prefer local JSONs, fall back to Hub)
-def _has_local_json(path: str) -> bool:
-    try:
-        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
-    except Exception:
-        return False
-
-if not _has_local_json(EVAL_REQUESTS_PATH):
-    try:
-        print(EVAL_REQUESTS_PATH)
-        snapshot_download(
-            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-        )
-    except Exception:
-        pass
-
-if not _has_local_json(EVAL_RESULTS_PATH):
-    try:
-        print(EVAL_RESULTS_PATH)
-        snapshot_download(
-            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-        )
-    except Exception:
-        pass
-
-
-# Build benchmark and evaluation queue column metadata
-BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
-
-EVAL_COLS = [
-    "Model",
-    "Model sha",
-    "status",
-    "precision",
-    "weight_type",
-    "model_type",
-    "likes",
-    "params",
-    "license",
-    "submitted_time",
-]
-
-EVAL_TYPES = [
-    "markdown",  # Model
-    "str",  # Model sha
-    "str",  # status
-    "str",  # precision
-    "str",  # weight_type
-    "str",  # model_type
-    "number",  # likes
-    "number",  # params
-    "str",  # license
-    "str",  # submitted_time
-]
-
-# Hide all models from the leaderboard view
-LEADERBOARD_DF = pd.DataFrame(columns=COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
-
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=["float16", "bfloat16", "float32", "int8", "int4"],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=["Original", "Delta", "Adapter"],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+import glob
+import json
+import pandas as pd
+import gradio as gr
+
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+EVAL_RESULTS_DIR = os.path.join(ROOT_DIR, "eval-results")
+
+
+def load_results(results_dir: str) -> pd.DataFrame:
+    rows = []
+    all_dataset_keys = set()
+
+    if not os.path.isdir(results_dir):
+        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
+
+    # First pass: collect all dataset keys from all files
+    for path in glob.glob(os.path.join(results_dir, "*.json")):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            res = data.get("results", {})
+            all_dataset_keys.update(res.keys())
+        except Exception:
+            continue
+
+    # Use dataset keys directly as display names
+    dataset_display_names = {key: key for key in all_dataset_keys}
+
+    # Second pass: extract data
+    for path in glob.glob(os.path.join(results_dir, "*.json")):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            cfg = data.get("config", {})
+            res = data.get("results", {})
+
+            model_name = cfg.get("model_name", "unknown")
+
+            # Extract PER for each dataset dynamically
+            per_values = {}
+            dur_values = []
+
+            for dataset_key in all_dataset_keys:
+                dataset_data = res.get(dataset_key, {})
+                per_value = dataset_data.get("per") if dataset_data else None
+                dur_value = dataset_data.get("avg_duration") if dataset_data else None
+
+                display_name = dataset_display_names[dataset_key]
+                per_values[f"PER {display_name}"] = per_value
+
+                if dur_value is not None:
+                    dur_values.append(dur_value)
+
+            # Calculate average PER across all datasets
+            per_vals = [v for v in per_values.values() if v is not None]
+            avg_per = sum(per_vals) / len(per_vals) if per_vals else None
+
+            # Calculate average duration
+            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
+
+            row = {
+                "Model": model_name,
+                "Avg PER": avg_per,
+                "Avg Duration (s)": avg_dur,
+                "_file": os.path.basename(path),
+            }
+            row.update(per_values)
+            rows.append(row)
+
+        except Exception:
+            continue
+
+    df = pd.DataFrame(rows)
+    if df.empty:
+        # Create default columns based on discovered datasets
+        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
+        for key in sorted(all_dataset_keys):
+            display_name = dataset_display_names[key]
+            default_cols.insert(-2, f"PER {display_name}")
+        return pd.DataFrame(columns=default_cols)
+
+    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
+    return df.reset_index(drop=True)
+
+
+def build_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Simple Phoneme Leaderboard")
+        info = gr.Markdown(f"Results directory: `{EVAL_RESULTS_DIR}`")
+
+        # Get initial data to determine columns dynamically
+        initial_df = load_results(EVAL_RESULTS_DIR)
+        if not initial_df.empty:
+            headers = list(initial_df.columns)
+            # Remove internal columns
+            headers = [h for h in headers if not h.startswith('_')]
+        else:
+            headers = ["Model", "Avg PER", "Avg Duration (s)"]
+
+        table = gr.Dataframe(headers=headers, row_count=5)
+
+        def refresh():
+            df = load_results(EVAL_RESULTS_DIR)
+            if df.empty:
+                return df
+
+            # Get the column order from the dataframe
+            cols = [c for c in df.columns if not c.startswith('_')]
+
+            # Ensure all columns exist for the dataframe component
+            for c in cols:
+                if c not in df.columns:
+                    df[c] = None
+            return df[cols].round(3)
+
+        btn = gr.Button("Refresh")
+        btn.click(fn=refresh, outputs=table)
+
+        # Auto-load on start
+        table.value = refresh()
+    return demo
+
+
+if __name__ == "__main__":
+    demo = build_interface()
+    demo.queue().launch()
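The simplified app.py builds its table entirely from `load_results`, so the loader can be sanity-checked without launching the Gradio UI. A minimal sketch, assuming `gradio` and `pandas` are installed and an `eval-results/` folder sits next to `app.py` (this snippet is illustrative, not part of the commit):

```python
# Hypothetical local smoke test for the simplified app.py.
from app import load_results, EVAL_RESULTS_DIR

df = load_results(EVAL_RESULTS_DIR)
# Rows arrive sorted by "Avg PER" ascending; "_file" is an internal bookkeeping column.
print(df[[c for c in df.columns if not c.startswith("_")]].round(3))
```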
app_default.py ADDED
@@ -0,0 +1,463 @@
+import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+import os
+
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    COLS,
+    AutoEvalColumn,
+    fields,
+)
+from src.about import Tasks
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+
+# Import simple leaderboard functionality
+import glob
+import json
+from functools import lru_cache
+
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+
+### Space initialisation (prefer local JSONs, fall back to Hub)
+def _has_local_json(path: str) -> bool:
+    try:
+        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
+    except Exception:
+        return False
+
+if not _has_local_json(EVAL_REQUESTS_PATH):
+    try:
+        print(EVAL_REQUESTS_PATH)
+        snapshot_download(
+            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
+    except Exception:
+        pass
+
+if not _has_local_json(EVAL_RESULTS_PATH):
+    try:
+        print(EVAL_RESULTS_PATH)
+        snapshot_download(
+            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
+    except Exception:
+        pass
+
+
+# Build benchmark and evaluation queue column metadata
+BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
+
+EVAL_COLS = [
+    "Model",
+    "Model sha",
+    "status",
+    "precision",
+    "weight_type",
+    "model_type",
+    "likes",
+    "params",
+    "license",
+    "submitted_time",
+]
+
+EVAL_TYPES = [
+    "markdown",  # Model
+    "str",  # Model sha
+    "str",  # status
+    "str",  # precision
+    "str",  # weight_type
+    "str",  # model_type
+    "number",  # likes
+    "number",  # params
+    "str",  # license
+    "str",  # submitted_time
+]
+
+# Hide all models from the leaderboard view
+LEADERBOARD_DF = pd.DataFrame(columns=COLS)
+
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+@lru_cache(maxsize=1)
+def _get_simple_dataset_keys(results_dir: str) -> tuple:
+    """Cache dataset keys to avoid repeated file scanning."""
+    all_dataset_keys = set()
+    if not os.path.isdir(results_dir):
+        return tuple()
+
+    for path in glob.glob(os.path.join(results_dir, "*.json")):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            res = data.get("results", {})
+            all_dataset_keys.update(res.keys())
+        except Exception:
+            continue
+
+    return tuple(sorted(all_dataset_keys))
+
+def load_simple_results(results_dir: str) -> pd.DataFrame:
+    """Load and process evaluation results from JSON files for simple leaderboard with caching."""
+    rows = []
+    all_dataset_keys = set(_get_simple_dataset_keys(results_dir))
+
+    if not all_dataset_keys:
+        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
+
+    # Use dataset keys directly as display names
+    dataset_display_names = {key: key for key in all_dataset_keys}
+
+    # Single pass: extract data with optimized processing
+    for path in glob.glob(os.path.join(results_dir, "*.json")):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            cfg = data.get("config", {})
+            res = data.get("results", {})
+
+            model_name = cfg.get("model_name", "unknown")
+
+            # Extract PER for each dataset dynamically
+            per_values = {}
+            dur_values = []
+
+            for dataset_key in all_dataset_keys:
+                dataset_data = res.get(dataset_key, {})
+                per_value = dataset_data.get("per") if dataset_data else None
+                dur_value = dataset_data.get("avg_duration") if dataset_data else None
+
+                display_name = dataset_display_names[dataset_key]
+                per_values[f"PER {display_name}"] = per_value
+
+                if dur_value is not None:
+                    dur_values.append(dur_value)
+
+            # Calculate average PER across all datasets
+            per_vals = [v for v in per_values.values() if v is not None]
+            avg_per = sum(per_vals) / len(per_vals) if per_vals else None
+
+            # Calculate average duration
+            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
+
+            row = {
+                "Model": model_name,
+                "Avg PER": avg_per,
+                "Avg Duration (s)": avg_dur,
+                "_file": os.path.basename(path),
+            }
+            row.update(per_values)
+            rows.append(row)
+
+        except Exception:
+            continue
+
+    df = pd.DataFrame(rows)
+    if df.empty:
+        # Create default columns based on discovered datasets
+        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
+        for key in sorted(all_dataset_keys):
+            display_name = dataset_display_names[key]
+            default_cols.insert(-2, f"PER {display_name}")
+        return pd.DataFrame(columns=default_cols)
+
+    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
+    return df.reset_index(drop=True)
+
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            ),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("📊 Simple Results", elem_id="simple-results-tab", id=1):
+            gr.Markdown("## 🎯 Phoneme Detection Results")
+            gr.Markdown("Compare phoneme recognition models across different datasets")
+
+            # Stats section for simple results
+            with gr.Row():
+                simple_total_models = gr.HTML(
+                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>'
+                )
+                simple_best_per = gr.HTML(
+                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>'
+                )
+                simple_avg_duration = gr.HTML(
+                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>'
+                )
+
+            # Get initial data to determine columns dynamically
+            initial_df = load_simple_results(EVAL_RESULTS_PATH)
+            if not initial_df.empty:
+                headers = list(initial_df.columns)
+                # Remove internal columns
+                headers = [h for h in headers if not h.startswith('_')]
+            else:
+                headers = ["Model", "Avg PER", "Avg Duration (s)"]
+
+            with gr.Row():
+                with gr.Column(scale=4):
+                    simple_table = gr.Dataframe(
+                        headers=headers,
+                        row_count=10,
+                        label="🏆 Model Performance Leaderboard",
+                        interactive=False
+                    )
+
+                with gr.Column(scale=1):
+                    refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
+
+                    # Export options
+                    with gr.Accordion("📥 Export Data", open=False):
+                        export_csv = gr.Button("📄 Export CSV", variant="secondary")
+                        export_json = gr.Button("📋 Export JSON", variant="secondary")
+
+            def refresh_simple():
+                """Refresh the simple leaderboard data with enhanced stats."""
+                df = load_simple_results(EVAL_RESULTS_PATH)
+
+                if df.empty:
+                    return df, "No data", "No data", "No data"
+
+                # Get the column order from the dataframe
+                cols = [c for c in df.columns if not c.startswith('_')]
+
+                # Ensure all columns exist for the dataframe component
+                for c in cols:
+                    if c not in df.columns:
+                        df[c] = None
+
+                # Calculate enhanced stats
+                total_models = len(df)
+                best_per_val = df['Avg PER'].min() if 'Avg PER' in df.columns and not df['Avg PER'].isna().all() else "N/A"
+                avg_duration_val = df['Avg Duration (s)'].mean() if 'Avg Duration (s)' in df.columns and not df['Avg Duration (s)'].isna().all() else "N/A"
+
+                # Format stats
+                best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
+                avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)
+
+                return (
+                    df[cols].round(3),
+                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{total_models}</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>',
+                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{best_per_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>',
+                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{avg_duration_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>'
+                )
+
+            def export_simple_csv():
+                """Export simple results as CSV."""
+                df = load_simple_results(EVAL_RESULTS_PATH)
+                if df.empty:
+                    return None
+                cols = [c for c in df.columns if not c.startswith('_')]
+                return df[cols].round(3)
+
+            def export_simple_json():
+                """Export simple results as JSON."""
+                df = load_simple_results(EVAL_RESULTS_PATH)
+                if df.empty:
+                    return None
+                cols = [c for c in df.columns if not c.startswith('_')]
+                return df[cols].round(3).to_json(orient='records', indent=2)
+
+            # Connect events
+            refresh_btn.click(
+                fn=refresh_simple,
+                outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration]
+            )
+
+            export_csv.click(
+                fn=export_simple_csv,
+                outputs=gr.File(label="Download CSV")
+            )
+
+            export_json.click(
+                fn=export_simple_json,
+                outputs=gr.File(label="Download JSON")
+            )
+
+            # Auto-load on start
+            simple_table.value, simple_total_models.value, simple_best_per.value, simple_avg_duration.value = refresh_simple()
+
+            # Enhanced help section
+            with gr.Accordion("ℹ️ About this Leaderboard", open=False):
+                gr.Markdown("""
+                ## 📊 Understanding the Results
+
+                **Performance Metrics:**
+                - **PER (Phoneme Error Rate)**: Lower values indicate better performance
+                - **Avg Duration**: Processing time per sample (lower is faster)
+                - **Models are ranked by average PER across all datasets**
+
+                **Datasets Evaluated:**
+                - `phoneme_asr`: General phoneme recognition dataset
+                - `kids_phoneme_md`: Kids' phoneme recognition dataset
+
+                **How to Interpret:**
+                - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
+                - **Duration**: Time efficiency (important for real-time applications)
+                - **Average PER**: Overall model performance across all datasets
+
+                **Tips for Model Selection:**
+                - Choose models with low PER for accuracy-critical applications
+                - Consider duration for real-time or resource-constrained environments
+                - Balance between accuracy (PER) and speed (Duration) based on your needs
+                """)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=["float16", "bfloat16", "float32", "int8", "int4"],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=["Original", "Delta", "Adapter"],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
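One behavioural note on the cached helper added above: `functools.lru_cache` keys `_get_simple_dataset_keys` only by the `results_dir` string, so results files added while the Space is running will not surface new dataset columns until the process restarts or the cache is cleared. A minimal illustration using the standard `lru_cache` helpers (this snippet is not code from the commit):

```python
# lru_cache-wrapped functions expose cache management helpers from the standard library.
_get_simple_dataset_keys.cache_clear()        # force a rescan of eval-results/*.json on next call
print(_get_simple_dataset_keys.cache_info())  # hits/misses for the single cached entry
```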
simple_leaderboard.py → app_simple.py RENAMED
@@ -3,20 +3,21 @@ import glob
 import json
 import pandas as pd
 import gradio as gr
-
+from typing import Optional, Dict, List
+import time
+from functools import lru_cache
 
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 EVAL_RESULTS_DIR = os.path.join(ROOT_DIR, "eval-results")
 
 
-def load_results(results_dir: str) -> pd.DataFrame:
-    rows = []
+@lru_cache(maxsize=1)
+def _get_dataset_keys(results_dir: str) -> tuple:
+    """Cache dataset keys to avoid repeated file scanning."""
     all_dataset_keys = set()
-
     if not os.path.isdir(results_dir):
-        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
-
-    # First pass: collect all dataset keys from all files
+        return tuple()
+
     for path in glob.glob(os.path.join(results_dir, "*.json")):
         try:
             with open(path, "r", encoding="utf-8") as f:
@@ -25,11 +26,24 @@ def load_results(results_dir: str) -> pd.DataFrame:
             all_dataset_keys.update(res.keys())
         except Exception:
             continue
+
+    return tuple(sorted(all_dataset_keys))
+
+def load_results(results_dir: str) -> pd.DataFrame:
+    """
+    Load and process evaluation results from JSON files.
+    Dynamically handles any number of datasets with caching for performance.
+    """
+    rows = []
+    all_dataset_keys = set(_get_dataset_keys(results_dir))
+
+    if not all_dataset_keys:
+        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
 
     # Use dataset keys directly as display names
     dataset_display_names = {key: key for key in all_dataset_keys}
 
-    # Second pass: extract data
+    # Single pass: extract data with optimized processing
     for path in glob.glob(os.path.join(results_dir, "*.json")):
         try:
             with open(path, "r", encoding="utf-8") as f:
@@ -87,25 +101,124 @@ def load_results(results_dir: str) -> pd.DataFrame:
 
 
 def build_interface():
-    with gr.Blocks() as demo:
-        gr.Markdown("# Simple Phoneme Leaderboard")
-        info = gr.Markdown(f"Results directory: `{EVAL_RESULTS_DIR}`")
+    """Build the optimized Gradio interface for the phoneme leaderboard."""
+
+    # Custom CSS for better styling
+    custom_css = """
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: 0 auto !important;
+    }
+    .leaderboard-header {
+        text-align: center;
+        margin-bottom: 2rem;
+    }
+    .stats-container {
+        display: flex;
+        gap: 1rem;
+        margin-bottom: 1rem;
+        flex-wrap: wrap;
+    }
+    .stat-card {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        padding: 1rem;
+        border-radius: 10px;
+        text-align: center;
+        min-width: 150px;
+        flex: 1;
+    }
+    .stat-value {
+        font-size: 1.5rem;
+        font-weight: bold;
+        margin-bottom: 0.5rem;
+    }
+    .stat-label {
+        font-size: 0.9rem;
+        opacity: 0.9;
+    }
+    .table-container {
+        margin-top: 1rem;
+    }
+    .refresh-btn {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        border: none;
+        padding: 0.5rem 1rem;
+        border-radius: 5px;
+        cursor: pointer;
+    }
+    """
+
+    with gr.Blocks(
+        title="Phoneme Detection Leaderboard",
+        css=custom_css,
+        theme=gr.themes.Soft()
+    ) as demo:
+
+        # Header section
+        with gr.Column(elem_classes="leaderboard-header"):
+            gr.Markdown("# 🎯 Phoneme Detection Leaderboard")
+            gr.Markdown("Compare phoneme recognition models across different datasets")
 
-        # Get initial data to determine columns dynamically
-        initial_df = load_results(EVAL_RESULTS_DIR)
-        if not initial_df.empty:
-            headers = list(initial_df.columns)
-            # Remove internal columns
-            headers = [h for h in headers if not h.startswith('_')]
-        else:
-            headers = ["Model", "Avg PER", "Avg Duration (s)"]
+        # Stats section
+        with gr.Row(elem_classes="stats-container"):
+            total_models = gr.HTML(
+                '<div class="stat-card"><div class="stat-value" id="total-models">-</div><div class="stat-label">Total Models</div></div>',
+                elem_id="total-models-card"
+            )
+            best_per = gr.HTML(
+                '<div class="stat-card"><div class="stat-value" id="best-per">-</div><div class="stat-label">Best PER</div></div>',
+                elem_id="best-per-card"
+            )
+            avg_duration = gr.HTML(
+                '<div class="stat-card"><div class="stat-value" id="avg-duration">-</div><div class="stat-label">Avg Duration</div></div>',
+                elem_id="avg-duration-card"
+            )
 
-        table = gr.Dataframe(headers=headers, row_count=5)
+        # Main content
+        with gr.Row():
+            with gr.Column(scale=4):
+                # Get initial data to determine columns dynamically
+                initial_df = load_results(EVAL_RESULTS_DIR)
+                if not initial_df.empty:
+                    headers = list(initial_df.columns)
+                    # Remove internal columns
+                    headers = [h for h in headers if not h.startswith('_')]
+                else:
+                    headers = ["Model", "Avg PER", "Avg Duration (s)"]
+
+                table = gr.Dataframe(
+                    headers=headers,
+                    row_count=10,
+                    label="🏆 Model Performance Leaderboard",
+                    interactive=False,
+                    elem_classes="table-container"
+                )
+
+            with gr.Column(scale=1):
+                refresh_btn = gr.Button(
+                    "🔄 Refresh Data",
+                    variant="primary",
+                    elem_classes="refresh-btn"
+                )
+
+                # Quick stats
+                with gr.Accordion("📊 Quick Stats", open=True):
+                    stats_display = gr.HTML("Loading statistics...")
+
+                # Export options
+                with gr.Accordion("📥 Export Data", open=False):
+                    export_csv = gr.Button("📄 Export as CSV", variant="secondary")
+                    export_json = gr.Button("📋 Export as JSON", variant="secondary")
 
         def refresh():
+            """Refresh the leaderboard data with performance optimization."""
+            start_time = time.time()
             df = load_results(EVAL_RESULTS_DIR)
+
             if df.empty:
-                return df
+                return df, "No data available", "No data available", "No data available"
 
             # Get the column order from the dataframe
             cols = [c for c in df.columns if not c.startswith('_')]
@@ -114,18 +227,92 @@ def build_interface():
             for c in cols:
                 if c not in df.columns:
                     df[c] = None
+
+            # Calculate stats
+            total_models = len(df)
+            best_per_val = df['Avg PER'].min() if 'Avg PER' in df.columns and not df['Avg PER'].isna().all() else "N/A"
+            avg_duration_val = df['Avg Duration (s)'].mean() if 'Avg Duration (s)' in df.columns and not df['Avg Duration (s)'].isna().all() else "N/A"
+
+            # Format stats
+            best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
+            avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)
+
+            load_time = time.time() - start_time
+
+            return (
+                df[cols].round(3),
+                f"<div class='stat-card'><div class='stat-value'>{total_models}</div><div class='stat-label'>Total Models</div></div>",
+                f"<div class='stat-card'><div class='stat-value'>{best_per_str}</div><div class='stat-label'>Best PER</div></div>",
+                f"<div class='stat-card'><div class='stat-value'>{avg_duration_str}</div><div class='stat-label'>Avg Duration</div></div>"
+            )
+
+        def export_csv_data():
+            """Export data as CSV."""
+            df = load_results(EVAL_RESULTS_DIR)
+            if df.empty:
+                return None
+            cols = [c for c in df.columns if not c.startswith('_')]
             return df[cols].round(3)
 
-        btn = gr.Button("Refresh")
-        btn.click(fn=refresh, outputs=table)
+        def export_json_data():
+            """Export data as JSON."""
+            df = load_results(EVAL_RESULTS_DIR)
+            if df.empty:
+                return None
+            cols = [c for c in df.columns if not c.startswith('_')]
+            return df[cols].round(3).to_json(orient='records', indent=2)
+
+        # Connect events
+        refresh_btn.click(
+            fn=refresh,
+            outputs=[table, total_models, best_per, avg_duration]
+        )
+
+        export_csv.click(
+            fn=export_csv_data,
+            outputs=gr.File(label="Download CSV")
+        )
+
+        export_json.click(
+            fn=export_json_data,
+            outputs=gr.File(label="Download JSON")
+        )
 
         # Auto-load on start
-        table.value = refresh()
+        table.value, total_models.value, best_per.value, avg_duration.value = refresh()
+
+        # Help section
+        with gr.Accordion("ℹ️ About this Leaderboard", open=False):
+            gr.Markdown("""
+            ## 📊 Understanding the Results
+
+            **Performance Metrics:**
+            - **PER (Phoneme Error Rate)**: Lower values indicate better performance
+            - **Avg Duration**: Processing time per sample (lower is faster)
+            - **Models are ranked by average PER across all datasets**
+
+            **Datasets Evaluated:**
+            - `phoneme_asr`: General phoneme recognition dataset
+            - `kids_phoneme_md`: Kids' phoneme recognition dataset
+
+            **How to Interpret:**
+            - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
+            - **Duration**: Time efficiency (important for real-time applications)
+            - **Average PER**: Overall model performance across all datasets
+
+            **Tips for Model Selection:**
+            - Choose models with low PER for accuracy-critical applications
+            - Consider duration for real-time or resource-constrained environments
+            - Balance between accuracy (PER) and speed (Duration) based on your needs
+            """)
+
     return demo
 
 
 if __name__ == "__main__":
     demo = build_interface()
-    demo.queue().launch()
-
-
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )
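The help text embedded above describes PER only informally ("percentage of phonemes incorrectly recognized"). The evaluation code that produced the numbers in the results files is not part of this commit; conventionally, PER is the edit (Levenshtein) distance between the predicted and reference phoneme sequences, normalised by the reference length. A small illustrative sketch of that convention, not the repository's actual metric code:

```python
def per(reference: list[str], hypothesis: list[str]) -> float:
    """Phoneme Error Rate in percent: edit distance / len(reference) * 100."""
    # Classic dynamic-programming edit distance over phoneme tokens.
    d = [[0] * (len(hypothesis) + 1) for _ in range(len(reference) + 1)]
    for i in range(len(reference) + 1):
        d[i][0] = i
    for j in range(len(hypothesis) + 1):
        d[0][j] = j
    for i in range(1, len(reference) + 1):
        for j in range(1, len(hypothesis) + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return 100.0 * d[len(reference)][len(hypothesis)] / max(len(reference), 1)

print(per(["k", "ae", "t"], ["k", "ah", "t"]))  # one substitution out of three -> 33.33...
```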
eval-results/results_1759289565_HuBERT-Base.json ADDED
@@ -0,0 +1,17 @@
+{
+    "config": {
+        "model_name": "local/HuBERT-Base",
+        "model_dtype": "float32",
+        "model_sha": ""
+    },
+    "results": {
+        "phoneme_asr": {
+            "per": 79.85359813133437,
+            "avg_duration": 0.5645037651062011
+        },
+        "kids_phoneme_md": {
+            "per": 71.85295670319688,
+            "avg_duration": 1.0543905973434449
+        }
+    }
+}
eval-results/results_1759289565_HuBERT-fine-tuned.json ADDED
@@ -0,0 +1,17 @@
+{
+    "config": {
+        "model_name": "local/HuBERT-fine-tuned",
+        "model_dtype": "float32",
+        "model_sha": ""
+    },
+    "results": {
+        "phoneme_asr": {
+            "per": 2.774112645808511,
+            "avg_duration": 0.5711040496826172
+        },
+        "kids_phoneme_md": {
+            "per": 12.210125572986708,
+            "avg_duration": 1.0601478815078735
+        }
+    }
+}
eval-results/results_1759289565_Timit.json ADDED
@@ -0,0 +1,17 @@
+{
+    "config": {
+        "model_name": "local/Timit",
+        "model_dtype": "float32",
+        "model_sha": ""
+    },
+    "results": {
+        "phoneme_asr": {
+            "per": 36.477283094931195,
+            "avg_duration": 0.554583740234375
+        },
+        "kids_phoneme_md": {
+            "per": 40.59831492610759,
+            "avg_duration": 1.0818484544754028
+        }
+    }
+}
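These three files are exactly what the `load_results` / `load_simple_results` helpers above aggregate: a model's "Avg PER" and "Avg Duration (s)" cells are plain means over its per-dataset entries. As a worked check for `local/HuBERT-Base`, using the values from its file:

```python
# Reproducing the aggregation performed by load_results for one results file.
per_phoneme_asr = 79.85359813133437
per_kids_phoneme_md = 71.85295670319688

avg_per = (per_phoneme_asr + per_kids_phoneme_md) / 2
print(round(avg_per, 3))  # 75.853 -> the "Avg PER" cell shown for local/HuBERT-Base
```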
src/about.py CHANGED
@@ -12,9 +12,9 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the results json, metric_key, column name for display
-    # Replace with your phoneme metrics and datasets
-    phoneme_dev = Task("phoneme_dev", "per", "Phoneme Dev PER")
-    phoneme_test = Task("phoneme_test", "per", "Phoneme Test PER")
+    # Using actual dataset names as keys
+    phoneme_asr = Task("phoneme_asr", "per", "PER phoneme_asr")
+    kids_phoneme_md = Task("kids_phoneme_md", "per", "PER kids_phoneme_md")
 
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
@@ -27,7 +27,7 @@ TITLE = """<h1 align="center" id="space-title">Phoneme Detection Leaderboard</h1
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 This leaderboard ranks phoneme detection models by average PER (lower is better).
-Evaluations aggregate across dev/test splits for a fair comparison.
+Evaluations aggregate across phoneme_asr and kids_phoneme_md datasets for a fair comparison.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
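For context on how the renamed tasks propagate: `app_default.py` builds its benchmark column names directly from this enum, so the new keys feed straight into the column metadata. A quick sketch of that mapping, mirroring the `BENCHMARK_COLS` line in `app_default.py` (the printed list below is the expected result, not verified output):

```python
from src.about import Tasks

# Mirrors: BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
benchmark_cols = [f"{task.value.col_name} ({task.name})" for task in Tasks]
print(benchmark_cols)
# Expected with the updated enum:
# ['PER phoneme_asr (phoneme_asr)', 'PER kids_phoneme_md (kids_phoneme_md)']
```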