"""Gradio leaderboard app for phoneme-recognition benchmarks.

Loads evaluation results from the dataset hub when available (falling back to
local JSON result files), renders a sortable leaderboard, and accepts requests
to evaluate new models.
"""

import glob
import json
import os
from datetime import datetime, timezone

import gradio as gr
import numpy as np
import pandas as pd

# Local helper modules shipped alongside this app.
from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS, LEADERBOARD_CSS
from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
from utils_display import PhonemeEvalColumn, fields, make_clickable_model, styled_error, styled_message

LAST_UPDATED = "Oct 2nd 2025"

# Optional per-benchmark descriptions surfaced in the UI.
benchmark_details = {}

# Local directory holding one JSON result file per evaluated model.
EVAL_RESULTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "eval-results")

# Mapping from raw CSV column names to the display names used in the table.
column_names = {
    "model": "Model",
    "avg_per": "Average PER ⬇️",
    "avg_duration": "Avg Duration (s)",
    "per_phoneme_asr": "PER phoneme_asr",
    "per_kids_phoneme_md": "PER kids_phoneme_md",
}


def load_results(results_dir: str) -> pd.DataFrame:
    """Load results from the JSON files in the results directory."""
    rows = []
    all_dataset_keys = set()

    def round_two_decimals(value):
        try:
            if value is None:
                return None
            return round(float(value), 2)
        except Exception:
            return value

    if not os.path.isdir(results_dir):
        return pd.DataFrame(columns=["Model", "Average PER ⬇️", "Avg Duration (s)"])

    # First pass: collect the union of dataset keys across all result files,
    # so every row gets the same set of columns.
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            all_dataset_keys.update(data.get("results", {}).keys())
        except Exception:
            continue

    dataset_display_names = {key: key for key in all_dataset_keys}

    # Second pass: build one leaderboard row per result file.
    for path in glob.glob(os.path.join(results_dir, "*.json")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            cfg = data.get("config", {})
            res = data.get("results", {})

            model_name = cfg.get("model_name", "unknown")

            per_values = {}
            dur_values = []

            for dataset_key in all_dataset_keys:
                dataset_data = res.get(dataset_key, {})
                per_value = dataset_data.get("per") if dataset_data else None
                dur_value = dataset_data.get("avg_duration") if dataset_data else None

                display_name = dataset_display_names[dataset_key]
                per_values[display_name] = round_two_decimals(per_value)

                if dur_value is not None:
                    dur_values.append(dur_value)

            per_vals = [v for v in per_values.values() if v is not None]
            avg_per = round_two_decimals(sum(per_vals) / len(per_vals) if per_vals else None)
            avg_dur = round_two_decimals(sum(dur_values) / len(dur_values) if dur_values else None)

            row = {
                "Model": make_clickable_model(model_name),
                "Average PER ⬇️": avg_per,
                "Avg Duration (s)": avg_dur,
            }
            row.update(per_values)
            rows.append(row)

        except Exception:
            continue

    df = pd.DataFrame(rows)
    if df.empty:
        # Keep the empty frame's columns consistent with the row keys above.
        default_cols = ["Model", "Average PER ⬇️", "Avg Duration (s)"]
        default_cols += [dataset_display_names[key] for key in sorted(all_dataset_keys)]
        return pd.DataFrame(columns=default_cols)

    df = df.sort_values(by=["Average PER ⬇️"], ascending=True, na_position="last")
    return df.reset_index(drop=True)
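
# A minimal sketch of the JSON layout load_results() expects in eval-results/,
# inferred from the parsing above (real files may carry extra fields; the
# numbers here are placeholders):
#
# {
#   "config":  {"model_name": "user_name/model_name"},
#   "results": {
#     "phoneme_asr":     {"per": 12.34, "avg_duration": 0.56},
#     "kids_phoneme_md": {"per": 23.45, "avg_duration": 0.78}
#   }
# }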


# Load the leaderboard data: prefer the CSV published on the dataset hub, and
# fall back to the local eval-results JSON files when the hub is unavailable.
eval_queue_repo, requested_models, csv_results = None, None, None
try:
    hub_info = load_all_info_from_dataset_hub()
    if isinstance(hub_info, tuple) and len(hub_info) >= 3:
        eval_queue_repo, requested_models, csv_results = hub_info[:3]

    if eval_queue_repo is None or requested_models is None or csv_results is None:
        original_df = load_results(EVAL_RESULTS_DIR)
    elif csv_results.exists():
        original_df = pd.read_csv(csv_results)

        def formatter(x):
            if isinstance(x, str):
                return x
            if x == -1:
                return "NA"
            return round(x, 2)

        for col in original_df.columns:
            if col == "model":
                original_df[col] = original_df[col].apply(make_clickable_model)
            else:
                original_df[col] = original_df[col].apply(formatter)

        existing_columns = {k: v for k, v in column_names.items() if k in original_df.columns}
        original_df.rename(columns=existing_columns, inplace=True)
        if "Average PER ⬇️" in original_df.columns:
            original_df.sort_values(by="Average PER ⬇️", inplace=True)
    else:
        original_df = load_results(EVAL_RESULTS_DIR)
except Exception as e:
    print(f"Error loading data: {e}")
    original_df = load_results(EVAL_RESULTS_DIR)
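
# A sketch of the header row the hub CSV is assumed to carry, matching the raw
# keys in column_names above (the published CSV may include more columns):
#
#   model,avg_per,avg_duration,per_phoneme_asr,per_kids_phoneme_md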

# Column names and types declared by the PhonemeEvalColumn dataclass.
COLS = [c.name for c in fields(PhonemeEvalColumn)]
TYPES = [c.type for c in fields(PhonemeEvalColumn)]


def request_model(model_text, chb_phoneme_asr, chb_kids_phoneme_md):
    """Validate a model request and queue it for evaluation."""
    # Collect the datasets the user ticked.
    dataset_selection = []
    if chb_phoneme_asr:
        dataset_selection.append("phoneme_asr")
    if chb_kids_phoneme_md:
        dataset_selection.append("kids_phoneme_md")

    if len(dataset_selection) == 0:
        return styled_error("You need to select at least one dataset")

    base_model_on_hub, error_msg = is_model_on_hub(model_text)
    if not base_model_on_hub:
        return styled_error(f"Base model '{model_text}' {error_msg}")

    # Construct the request entry.
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    required_datasets = ", ".join(dataset_selection)
    eval_entry = {
        "date": current_time,
        "model": model_text,
        "datasets_selected": required_datasets,
    }

    DIR_OUTPUT_REQUESTS.mkdir(parents=True, exist_ok=True)

    fn_datasets = "@ ".join(dataset_selection)
    filename = model_text.replace("/", "@") + "@@" + fn_datasets
    if requested_models and filename in requested_models:
        return styled_error(f"A request for this model '{model_text}' and dataset(s) was already made.")

    try:
        filename_ext = filename + ".txt"
        out_filepath = DIR_OUTPUT_REQUESTS / filename_ext

        # Write the request locally, push it to the hub, then remove the
        # local copy.
        with open(out_filepath, "w") as f:
            f.write(json.dumps(eval_entry))

        upload_file(filename, out_filepath)

        if requested_models is not None:
            requested_models.append(filename)

        out_filepath.unlink()

        return styled_message("🤗 Your request has been submitted and will be evaluated soon!")
    except Exception:
        return styled_error("Error submitting request!")
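
# Example of the request filename produced above (hypothetical model id):
# requesting "user_name/model_name" on both datasets yields
#   user_name@model_name@@phoneme_asr@ kids_phoneme_md.txt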


def filter_main_table(show_proprietary=True):
    filtered_df = original_df.copy()

    # Keep only open models when the "show proprietary" box is unticked
    # (a no-op when the table carries no "License" column).
    if not show_proprietary and "License" in filtered_df.columns:
        filtered_df = filtered_df[filtered_df["License"] == "Open"]

    return filtered_df


def refresh_results():
    """Refresh the results from the eval-results directory."""
    return load_results(EVAL_RESULTS_DIR)


with gr.Blocks(css=LEADERBOARD_CSS) as demo:
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Leaderboard", elem_id="phoneme-benchmark-tab-table", id=0):
            leaderboard_table = gr.components.Dataframe(
                value=original_df,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )
            with gr.Row():
                show_proprietary_checkbox = gr.Checkbox(
                    label="Show proprietary models",
                    value=True,
                    elem_id="show-proprietary-checkbox",
                )
                refresh_button = gr.Button("🔄 Refresh Results", variant="secondary")

            show_proprietary_checkbox.change(
                filter_main_table,
                inputs=[show_proprietary_checkbox],
                outputs=leaderboard_table,
            )

            refresh_button.click(
                refresh_results,
                outputs=leaderboard_table,
            )

        with gr.TabItem("📈 Metrics", elem_id="phoneme-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        with gr.TabItem("✉️✨ Request a model here!", elem_id="phoneme-benchmark-tab-table", id=2):
            with gr.Column():
                gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
            with gr.Column():
                gr.Markdown("Select datasets:", elem_classes="markdown-text")
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                chb_phoneme_asr = gr.Checkbox(label="phoneme_asr dataset", value=True)
                chb_kids_phoneme_md = gr.Checkbox(label="kids_phoneme_md dataset", value=True)
            with gr.Column():
                mdw_submission_result = gr.Markdown()
                btn_submit = gr.Button(value="🚀 Request")
                btn_submit.click(
                    request_model,
                    [model_name_textbox, chb_phoneme_asr, chb_kids_phoneme_md],
                    mdw_submission_result,
                )

        with gr.TabItem("🤗 About", elem_id="phoneme-benchmark-tab-table", id=3):
            gr.Markdown("## About", elem_classes="markdown-text")
            gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT,
                lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

demo.launch()
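
# Run directly with `python app.py` to serve the app locally; on Hugging Face
# Spaces, app.py is executed at startup, so the module-level launch() call
# starts the hosted app as well.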