import gradio as gr
import pandas as pd
import json
import os
import glob
from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS, LEADERBOARD_CSS
from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
from utils_display import PhonemeEvalColumn, fields, make_clickable_model, styled_error, styled_message
import numpy as np
from datetime import datetime, timezone

# from dotenv import load_dotenv
# # Load environment variables from .env file
# load_dotenv()
# HF_TOKEN = os.environ.get("HF_TOKEN", None)

LAST_UPDATED = "Oct 2nd 2025"

# Global variable to store detailed benchmark data
benchmark_details = {}

# Directory for evaluation results
EVAL_RESULTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "eval-results")

column_names = {
    "model": "Model",
    "avg_per": "Average PER ⬇️",
    "avg_duration": "Avg Duration (s)",
    "per_phoneme_asr": "PER phoneme_asr",
    "per_kids_phoneme_md": "PER kids_phoneme_md",
}
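

# The loader below expects each JSON result file in EVAL_RESULTS_DIR to look
# roughly like the sketch that follows. This is an illustrative assumption
# inferred from the fields load_results() reads (config.model_name,
# results.<dataset>.per, results.<dataset>.avg_duration); real result files
# may carry additional keys, and the numeric values shown here are made up.
#
# {
#   "config": {"model_name": "user_name/model_name"},
#   "results": {
#     "phoneme_asr":     {"per": 12.34, "avg_duration": 0.56},
#     "kids_phoneme_md": {"per": 23.45, "avg_duration": 0.78}
#   }
# }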
def load_results(results_dir: str) -> pd.DataFrame:
    """Load results from JSON files in the results directory"""
    rows = []
    all_dataset_keys = set()

    def round_two_decimals(value):
        try:
            if value is None:
                return None
            return round(float(value), 2)
        except Exception:
            return value

    if not os.path.isdir(results_dir):
        return pd.DataFrame(columns=["Model", "Average PER ⬇️", "Avg Duration (s)"])

    # First pass: collect all dataset keys from all files
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            res = data.get("results", {})
            all_dataset_keys.update(res.keys())
        except Exception:
            continue

    # Use dataset keys directly as display names
    dataset_display_names = {key: key for key in all_dataset_keys}

    # Second pass: extract data
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            cfg = data.get("config", {})
            res = data.get("results", {})
            model_name = cfg.get("model_name", "unknown")

            # Extract PER for each dataset dynamically
            per_values = {}
            dur_values = []
            for dataset_key in all_dataset_keys:
                dataset_data = res.get(dataset_key, {})
                per_value = dataset_data.get("per") if dataset_data else None
                dur_value = dataset_data.get("avg_duration") if dataset_data else None
                display_name = dataset_display_names[dataset_key]
                per_values[display_name] = round_two_decimals(per_value)
                if dur_value is not None:
                    dur_values.append(dur_value)

            # Calculate average PER across all datasets
            per_vals = [v for v in per_values.values() if v is not None]
            avg_per = sum(per_vals) / len(per_vals) if per_vals else None
            avg_per = round_two_decimals(avg_per)

            # Calculate average duration
            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
            avg_dur = round_two_decimals(avg_dur)

            row = {
                "Model": make_clickable_model(model_name),
                "Average PER ⬇️": avg_per,
                "Avg Duration (s)": avg_dur,
            }
            row.update(per_values)
            rows.append(row)
        except Exception:
            continue

    df = pd.DataFrame(rows)
    if df.empty:
        # Create default columns based on discovered datasets
        default_cols = ["Model", "Average PER ⬇️", "Avg Duration (s)"]
        for key in sorted(all_dataset_keys):
            display_name = dataset_display_names[key]
            default_cols.insert(-2, f"PER {display_name}")
        return pd.DataFrame(columns=default_cols)

    df = df.sort_values(by=["Average PER ⬇️"], ascending=True, na_position="last")
    return df.reset_index(drop=True)


# Load initial data
# Defaults ensure these names exist even if loading from the hub fails below
eval_queue_repo, requested_models, csv_results = None, None, None
try:
    # Support both legacy (3-tuple) and new (4-tuple) returns
    hub_info = load_all_info_from_dataset_hub()
    if isinstance(hub_info, tuple) and len(hub_info) >= 3:
        eval_queue_repo = hub_info[0]
        requested_models = hub_info[1]
        csv_results = hub_info[2]
        # Fourth value (if present) is not used in this app
    else:
        eval_queue_repo, requested_models, csv_results = None, None, None

    if eval_queue_repo is None or requested_models is None or csv_results is None:
        # No token provided, fallback to local results
        original_df = load_results(EVAL_RESULTS_DIR)
    elif csv_results and csv_results.exists():
        original_df = pd.read_csv(csv_results)

        # Format the columns
        def formatter(x):
            if isinstance(x, str):
                return x
            if x == -1:
                return "NA"
            return round(x, 2)

        for col in original_df.columns:
            if col == "model":
                original_df[col] = original_df[col].apply(make_clickable_model)
            else:
                original_df[col] = original_df[col].apply(formatter)

        # Only rename columns that exist in the dataframe
        existing_columns = {k: v for k, v in column_names.items() if k in original_df.columns}
        original_df.rename(columns=existing_columns, inplace=True)
        if 'Average PER ⬇️' in original_df.columns:
            original_df.sort_values(by='Average PER ⬇️', inplace=True)
    else:
        # Fallback to local results
        original_df = load_results(EVAL_RESULTS_DIR)
except Exception as e:
    print(f"Error loading data: {e}")
    # Fallback to local results
    original_df = load_results(EVAL_RESULTS_DIR)

COLS = [c.name for c in fields(PhonemeEvalColumn)]
TYPES = [c.type for c in fields(PhonemeEvalColumn)]


def request_model(model_text, chb_phoneme_asr, chb_kids_phoneme_md):
    # Determine the selected checkboxes
    dataset_selection = []
    if chb_phoneme_asr:
        dataset_selection.append("phoneme_asr")
    if chb_kids_phoneme_md:
        dataset_selection.append("kids_phoneme_md")

    if len(dataset_selection) == 0:
        return styled_error("You need to select at least one dataset")

    base_model_on_hub, error_msg = is_model_on_hub(model_text)
    if not base_model_on_hub:
        return styled_error(f"Base model '{model_text}' {error_msg}")

    # Construct the output dictionary
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    required_datasets = ', '.join(dataset_selection)
    eval_entry = {
        "date": current_time,
        "model": model_text,
        "datasets_selected": required_datasets
    }

    # Prepare file path
    DIR_OUTPUT_REQUESTS.mkdir(parents=True, exist_ok=True)

    fn_datasets = '@ '.join(dataset_selection)
    filename = model_text.replace("/", "@") + "@@" + fn_datasets
    if requested_models and filename in requested_models:
        return styled_error(f"A request for this model '{model_text}' and dataset(s) was already made.")

    try:
        filename_ext = filename + ".txt"
        out_filepath = DIR_OUTPUT_REQUESTS / filename_ext

        # Write the results to a text file
        with open(out_filepath, "w") as f:
            f.write(json.dumps(eval_entry))

        upload_file(filename, out_filepath)

        # Include file in the list of uploaded files
        if requested_models is not None:
            requested_models.append(filename)

        # Remove the local file
        out_filepath.unlink()

        return styled_message("🤗 Your request has been submitted and will be evaluated soon!")
") except Exception as e: return styled_error(f"Error submitting request!") def filter_main_table(show_proprietary=True): filtered_df = original_df.copy() # Filter proprietary models if needed if not show_proprietary and "License" in filtered_df.columns: # Keep only models with "Open" license filtered_df = filtered_df[filtered_df["License"] == "Open"] return filtered_df def refresh_results(): """Refresh the results from the eval-results directory""" updated_df = load_results(EVAL_RESULTS_DIR) return updated_df with gr.Blocks(css=LEADERBOARD_CSS) as demo: # gr.HTML(BANNER, elem_id="banner") gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.TabItem("🏅 Leaderboard", elem_id="phoneme-benchmark-tab-table", id=0): leaderboard_table = gr.components.Dataframe( value=original_df, datatype=TYPES, elem_id="leaderboard-table", interactive=False, visible=True, ) with gr.Row(): show_proprietary_checkbox = gr.Checkbox( label="Show proprietary models", value=True, elem_id="show-proprietary-checkbox" ) refresh_button = gr.Button("🔄 Refresh Results", variant="secondary") # Connect checkbox to the filtering function show_proprietary_checkbox.change( filter_main_table, inputs=[show_proprietary_checkbox], outputs=leaderboard_table ) # Connect refresh button refresh_button.click( refresh_results, outputs=leaderboard_table ) with gr.TabItem("📈 Metrics", elem_id="phoneme-benchmark-tab-table", id=1): gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text") with gr.TabItem("✉️✨ Request a model here!", elem_id="phoneme-benchmark-tab-table", id=2): with gr.Column(): gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text") with gr.Column(): gr.Markdown("Select datasets:", elem_classes="markdown-text") with gr.Column(): model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)") chb_phoneme_asr = gr.Checkbox(label="phoneme_asr dataset", value=True) chb_kids_phoneme_md = gr.Checkbox(label="kids_phoneme_md dataset", value=True) with gr.Column(): mdw_submission_result = gr.Markdown() btn_submitt = gr.Button(value="🚀 Request") btn_submitt.click(request_model, [model_name_textbox, chb_phoneme_asr, chb_kids_phoneme_md], mdw_submission_result) # add an about section with gr.TabItem("🤗 About", elem_id="phoneme-benchmark-tab-table", id=3): gr.Markdown("## About", elem_classes="markdown-text") gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text") with gr.Row(): with gr.Accordion("📙 Citation", open=False): gr.Textbox( value=CITATION_TEXT, lines=7, label="Copy the BibTeX snippet to cite this source", elem_id="citation-button", show_copy_button=True, ) demo.launch()