lataon committed on
Commit f3ebaf3 · 1 Parent(s): cbe4946

use simple leaderboard
.gitignore CHANGED
@@ -7,7 +7,7 @@ __pycache__/
 .vscode/
 
 eval-queue/
-eval-results/
+# eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
app.py CHANGED
@@ -1,239 +1,131 @@
-import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 import os
 
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    COLS,
-    AutoEvalColumn,
-    fields,
-)
-from src.about import Tasks
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation (prefer local JSONs, fall back to Hub)
-def _has_local_json(path: str) -> bool:
-    try:
-        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
-    except Exception:
-        return False
-
-if not _has_local_json(EVAL_REQUESTS_PATH):
-    try:
-        print(EVAL_REQUESTS_PATH)
-        snapshot_download(
-            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-        )
-    except Exception:
-        pass
-
-if not _has_local_json(EVAL_RESULTS_PATH):
-    try:
-        print(EVAL_RESULTS_PATH)
-        snapshot_download(
-            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-        )
-    except Exception:
-        pass
-
-
-# Build benchmark and evaluation queue column metadata
-BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
-
-EVAL_COLS = [
-    "Model",
-    "Model sha",
-    "status",
-    "precision",
-    "weight_type",
-    "model_type",
-    "likes",
-    "params",
-    "license",
-    "submitted_time",
-]
-
-EVAL_TYPES = [
-    "markdown",  # Model
-    "str",  # Model sha
-    "str",  # status
-    "str",  # precision
-    "str",  # weight_type
-    "str",  # model_type
-    "number",  # likes
-    "number",  # params
-    "str",  # license
-    "str",  # submitted_time
-]
-
-# Hide all models from the leaderboard view
-LEADERBOARD_DF = pd.DataFrame(columns=COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
-
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=["float16", "bfloat16", "float32", "int8", "int4"],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=["Original", "Delta", "Adapter"],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+import glob
+import json
+import pandas as pd
+import gradio as gr
+
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+EVAL_RESULTS_DIR = os.path.join(ROOT_DIR, "eval-results")
+
+
+def load_results(results_dir: str) -> pd.DataFrame:
+    rows = []
+    all_dataset_keys = set()
+
+    if not os.path.isdir(results_dir):
+        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
+
+    # First pass: collect all dataset keys from all files
+    for path in glob.glob(os.path.join(results_dir, "*.json")):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            res = data.get("results", {})
+            all_dataset_keys.update(res.keys())
+        except Exception:
+            continue
+
+    # Use dataset keys directly as display names
+    dataset_display_names = {key: key for key in all_dataset_keys}
+
+    # Second pass: extract data
+    for path in glob.glob(os.path.join(results_dir, "*.json")):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            cfg = data.get("config", {})
+            res = data.get("results", {})
+
+            model_name = cfg.get("model_name", "unknown")
+
+            # Extract PER for each dataset dynamically
+            per_values = {}
+            dur_values = []
+
+            for dataset_key in all_dataset_keys:
+                dataset_data = res.get(dataset_key, {})
+                per_value = dataset_data.get("per") if dataset_data else None
+                dur_value = dataset_data.get("avg_duration") if dataset_data else None
+
+                display_name = dataset_display_names[dataset_key]
+                per_values[f"PER {display_name}"] = per_value
+
+                if dur_value is not None:
+                    dur_values.append(dur_value)
+
+            # Calculate average PER across all datasets
+            per_vals = [v for v in per_values.values() if v is not None]
+            avg_per = sum(per_vals) / len(per_vals) if per_vals else None
+
+            # Calculate average duration
+            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
+
+            row = {
+                "Model": model_name,
+                "Avg PER": avg_per,
+                "Avg Duration (s)": avg_dur,
+                "_file": os.path.basename(path),
+            }
+            row.update(per_values)
+            rows.append(row)
+
+        except Exception:
+            continue
+
+    df = pd.DataFrame(rows)
+    if df.empty:
+        # Create default columns based on discovered datasets
+        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
+        for key in sorted(all_dataset_keys):
+            display_name = dataset_display_names[key]
+            default_cols.insert(-2, f"PER {display_name}")
+        return pd.DataFrame(columns=default_cols)
+
+    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
+    return df.reset_index(drop=True)
+
+
+def build_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Simple Phoneme Leaderboard")
+        info = gr.Markdown(f"Results directory: `{EVAL_RESULTS_DIR}`")
+
+        # Get initial data to determine columns dynamically
+        initial_df = load_results(EVAL_RESULTS_DIR)
+        if not initial_df.empty:
+            headers = list(initial_df.columns)
+            # Remove internal columns
+            headers = [h for h in headers if not h.startswith('_')]
+        else:
+            headers = ["Model", "Avg PER", "Avg Duration (s)"]
+
+        table = gr.Dataframe(headers=headers, row_count=5)
+
+        def refresh():
+            df = load_results(EVAL_RESULTS_DIR)
+            if df.empty:
+                return df
+
+            # Get the column order from the dataframe
+            cols = [c for c in df.columns if not c.startswith('_')]
+
+            # Ensure all columns exist for the dataframe component
+            for c in cols:
+                if c not in df.columns:
+                    df[c] = None
+            return df[cols].round(3)
+
+        btn = gr.Button("Refresh")
+        btn.click(fn=refresh, outputs=table)
+
+        # Auto-load on start
+        table.value = refresh()
+    return demo
+
+
+if __name__ == "__main__":
+    demo = build_interface()
+    demo.queue().launch()
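The simplified app.py builds its table entirely from `load_results`, so the loader can be sanity-checked without launching the Gradio UI. A minimal sketch, assuming `gradio` and `pandas` are installed and an `eval-results/` folder sits next to `app.py` (this snippet is illustrative, not part of the commit):

```python
# Hypothetical local smoke test for the simplified app.py.
from app import load_results, EVAL_RESULTS_DIR

df = load_results(EVAL_RESULTS_DIR)
# Rows arrive sorted by "Avg PER" ascending; "_file" is an internal bookkeeping column.
print(df[[c for c in df.columns if not c.startswith("_")]].round(3))
```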
app_default.py ADDED
@@ -0,0 +1,463 @@
+import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+import os
+
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    COLS,
+    AutoEvalColumn,
+    fields,
+)
+from src.about import Tasks
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+
+# Import simple leaderboard functionality
+import glob
+import json
+from functools import lru_cache
+
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+
+### Space initialisation (prefer local JSONs, fall back to Hub)
+def _has_local_json(path: str) -> bool:
+    try:
+        return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
+    except Exception:
+        return False
+
+if not _has_local_json(EVAL_REQUESTS_PATH):
+    try:
+        print(EVAL_REQUESTS_PATH)
+        snapshot_download(
+            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
+    except Exception:
+        pass
+
+if not _has_local_json(EVAL_RESULTS_PATH):
+    try:
+        print(EVAL_RESULTS_PATH)
+        snapshot_download(
+            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
+    except Exception:
+        pass
+
+
+# Build benchmark and evaluation queue column metadata
+BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
+
+EVAL_COLS = [
+    "Model",
+    "Model sha",
+    "status",
+    "precision",
+    "weight_type",
+    "model_type",
+    "likes",
+    "params",
+    "license",
+    "submitted_time",
+]
+
+EVAL_TYPES = [
+    "markdown",  # Model
+    "str",  # Model sha
+    "str",  # status
+    "str",  # precision
+    "str",  # weight_type
+    "str",  # model_type
+    "number",  # likes
+    "number",  # params
+    "str",  # license
+    "str",  # submitted_time
+]
+
+# Hide all models from the leaderboard view
+LEADERBOARD_DF = pd.DataFrame(columns=COLS)
+
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+@lru_cache(maxsize=1)
+def _get_simple_dataset_keys(results_dir: str) -> tuple:
+    """Cache dataset keys to avoid repeated file scanning."""
+    all_dataset_keys = set()
+    if not os.path.isdir(results_dir):
+        return tuple()
+
+    for path in glob.glob(os.path.join(results_dir, "*.json")):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            res = data.get("results", {})
+            all_dataset_keys.update(res.keys())
+        except Exception:
+            continue
+
+    return tuple(sorted(all_dataset_keys))
+
+def load_simple_results(results_dir: str) -> pd.DataFrame:
+    """Load and process evaluation results from JSON files for simple leaderboard with caching."""
+    rows = []
+    all_dataset_keys = set(_get_simple_dataset_keys(results_dir))
+
+    if not all_dataset_keys:
+        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
+
+    # Use dataset keys directly as display names
+    dataset_display_names = {key: key for key in all_dataset_keys}
+
+    # Single pass: extract data with optimized processing
+    for path in glob.glob(os.path.join(results_dir, "*.json")):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            cfg = data.get("config", {})
+            res = data.get("results", {})
+
+            model_name = cfg.get("model_name", "unknown")
+
+            # Extract PER for each dataset dynamically
+            per_values = {}
+            dur_values = []
+
+            for dataset_key in all_dataset_keys:
+                dataset_data = res.get(dataset_key, {})
+                per_value = dataset_data.get("per") if dataset_data else None
+                dur_value = dataset_data.get("avg_duration") if dataset_data else None
+
+                display_name = dataset_display_names[dataset_key]
+                per_values[f"PER {display_name}"] = per_value
+
+                if dur_value is not None:
+                    dur_values.append(dur_value)
+
+            # Calculate average PER across all datasets
+            per_vals = [v for v in per_values.values() if v is not None]
+            avg_per = sum(per_vals) / len(per_vals) if per_vals else None
+
+            # Calculate average duration
+            avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
+
+            row = {
+                "Model": model_name,
+                "Avg PER": avg_per,
+                "Avg Duration (s)": avg_dur,
+                "_file": os.path.basename(path),
+            }
+            row.update(per_values)
+            rows.append(row)
+
+        except Exception:
+            continue
+
+    df = pd.DataFrame(rows)
+    if df.empty:
+        # Create default columns based on discovered datasets
+        default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
+        for key in sorted(all_dataset_keys):
+            display_name = dataset_display_names[key]
+            default_cols.insert(-2, f"PER {display_name}")
+        return pd.DataFrame(columns=default_cols)
+
+    df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
+    return df.reset_index(drop=True)
+
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            ),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("📊 Simple Results", elem_id="simple-results-tab", id=1):
+            gr.Markdown("## 🎯 Phoneme Detection Results")
+            gr.Markdown("Compare phoneme recognition models across different datasets")
+
+            # Stats section for simple results
+            with gr.Row():
+                simple_total_models = gr.HTML(
+                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>'
+                )
+                simple_best_per = gr.HTML(
+                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>'
+                )
+                simple_avg_duration = gr.HTML(
+                    '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>'
+                )
+
+            # Get initial data to determine columns dynamically
+            initial_df = load_simple_results(EVAL_RESULTS_PATH)
+            if not initial_df.empty:
+                headers = list(initial_df.columns)
+                # Remove internal columns
+                headers = [h for h in headers if not h.startswith('_')]
+            else:
+                headers = ["Model", "Avg PER", "Avg Duration (s)"]
+
+            with gr.Row():
+                with gr.Column(scale=4):
+                    simple_table = gr.Dataframe(
+                        headers=headers,
+                        row_count=10,
+                        label="🏆 Model Performance Leaderboard",
+                        interactive=False
+                    )
+
+                with gr.Column(scale=1):
+                    refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
+
+                    # Export options
+                    with gr.Accordion("📥 Export Data", open=False):
+                        export_csv = gr.Button("📄 Export CSV", variant="secondary")
+                        export_json = gr.Button("📋 Export JSON", variant="secondary")
+
+            def refresh_simple():
+                """Refresh the simple leaderboard data with enhanced stats."""
+                df = load_simple_results(EVAL_RESULTS_PATH)
+
+                if df.empty:
+                    return df, "No data", "No data", "No data"
+
+                # Get the column order from the dataframe
+                cols = [c for c in df.columns if not c.startswith('_')]
+
+                # Ensure all columns exist for the dataframe component
+                for c in cols:
+                    if c not in df.columns:
+                        df[c] = None
+
+                # Calculate enhanced stats
+                total_models = len(df)
+                best_per_val = df['Avg PER'].min() if 'Avg PER' in df.columns and not df['Avg PER'].isna().all() else "N/A"
+                avg_duration_val = df['Avg Duration (s)'].mean() if 'Avg Duration (s)' in df.columns and not df['Avg Duration (s)'].isna().all() else "N/A"
+
+                # Format stats
+                best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
+                avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)
+
+                return (
+                    df[cols].round(3),
+                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{total_models}</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>',
+                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{best_per_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>',
+                    f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{avg_duration_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>'
+                )
+
+            def export_simple_csv():
+                """Export simple results as CSV."""
+                df = load_simple_results(EVAL_RESULTS_PATH)
+                if df.empty:
+                    return None
+                cols = [c for c in df.columns if not c.startswith('_')]
+                return df[cols].round(3)
+
+            def export_simple_json():
+                """Export simple results as JSON."""
+                df = load_simple_results(EVAL_RESULTS_PATH)
+                if df.empty:
+                    return None
+                cols = [c for c in df.columns if not c.startswith('_')]
+                return df[cols].round(3).to_json(orient='records', indent=2)
+
+            # Connect events
+            refresh_btn.click(
+                fn=refresh_simple,
+                outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration]
+            )
+
+            export_csv.click(
+                fn=export_simple_csv,
+                outputs=gr.File(label="Download CSV")
+            )
+
+            export_json.click(
+                fn=export_simple_json,
+                outputs=gr.File(label="Download JSON")
+            )
+
+            # Auto-load on start
+            simple_table.value, simple_total_models.value, simple_best_per.value, simple_avg_duration.value = refresh_simple()
+
+            # Enhanced help section
+            with gr.Accordion("ℹ️ About this Leaderboard", open=False):
+                gr.Markdown("""
+                ## 📊 Understanding the Results
+
+                **Performance Metrics:**
+                - **PER (Phoneme Error Rate)**: Lower values indicate better performance
+                - **Avg Duration**: Processing time per sample (lower is faster)
+                - **Models are ranked by average PER across all datasets**
+
+                **Datasets Evaluated:**
+                - `phoneme_asr`: General phoneme recognition dataset
+                - `kids_phoneme_md`: Kids' phoneme recognition dataset
+
+                **How to Interpret:**
+                - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
+                - **Duration**: Time efficiency (important for real-time applications)
+                - **Average PER**: Overall model performance across all datasets
+
+                **Tips for Model Selection:**
+                - Choose models with low PER for accuracy-critical applications
+                - Consider duration for real-time or resource-constrained environments
+                - Balance between accuracy (PER) and speed (Duration) based on your needs
+                """)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=["float16", "bfloat16", "float32", "int8", "int4"],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=["Original", "Delta", "Adapter"],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
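One behavioural note on the cached helper added above: `functools.lru_cache` keys `_get_simple_dataset_keys` only by the `results_dir` string, so results files added while the Space is running will not surface new dataset columns until the process restarts or the cache is cleared. A minimal illustration using the standard `lru_cache` helpers (this snippet is not code from the commit):

```python
# lru_cache-wrapped functions expose cache management helpers from the standard library.
_get_simple_dataset_keys.cache_clear()        # force a rescan of eval-results/*.json on next call
print(_get_simple_dataset_keys.cache_info())  # hits/misses for the single cached entry
```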
simple_leaderboard.py → app_simple.py RENAMED
@@ -3,20 +3,21 @@ import glob
 import json
 import pandas as pd
 import gradio as gr
-
+from typing import Optional, Dict, List
+import time
+from functools import lru_cache
 
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 EVAL_RESULTS_DIR = os.path.join(ROOT_DIR, "eval-results")
 
 
-def load_results(results_dir: str) -> pd.DataFrame:
-    rows = []
+@lru_cache(maxsize=1)
+def _get_dataset_keys(results_dir: str) -> tuple:
+    """Cache dataset keys to avoid repeated file scanning."""
     all_dataset_keys = set()
-
     if not os.path.isdir(results_dir):
-        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
-
-    # First pass: collect all dataset keys from all files
+        return tuple()
+
     for path in glob.glob(os.path.join(results_dir, "*.json")):
         try:
             with open(path, "r", encoding="utf-8") as f:
@@ -25,11 +26,24 @@ def load_results(results_dir: str) -> pd.DataFrame:
             all_dataset_keys.update(res.keys())
         except Exception:
             continue
+
+    return tuple(sorted(all_dataset_keys))
+
+def load_results(results_dir: str) -> pd.DataFrame:
+    """
+    Load and process evaluation results from JSON files.
+    Dynamically handles any number of datasets with caching for performance.
+    """
+    rows = []
+    all_dataset_keys = set(_get_dataset_keys(results_dir))
+
+    if not all_dataset_keys:
+        return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
 
     # Use dataset keys directly as display names
     dataset_display_names = {key: key for key in all_dataset_keys}
 
-    # Second pass: extract data
+    # Single pass: extract data with optimized processing
     for path in glob.glob(os.path.join(results_dir, "*.json")):
         try:
             with open(path, "r", encoding="utf-8") as f:
@@ -87,25 +101,124 @@ def load_results(results_dir: str) -> pd.DataFrame:
 
 
 def build_interface():
-    with gr.Blocks() as demo:
-        gr.Markdown("# Simple Phoneme Leaderboard")
-        info = gr.Markdown(f"Results directory: `{EVAL_RESULTS_DIR}`")
+    """Build the optimized Gradio interface for the phoneme leaderboard."""
+
+    # Custom CSS for better styling
+    custom_css = """
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: 0 auto !important;
+    }
+    .leaderboard-header {
+        text-align: center;
+        margin-bottom: 2rem;
+    }
+    .stats-container {
+        display: flex;
+        gap: 1rem;
+        margin-bottom: 1rem;
+        flex-wrap: wrap;
+    }
+    .stat-card {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        padding: 1rem;
+        border-radius: 10px;
+        text-align: center;
+        min-width: 150px;
+        flex: 1;
+    }
+    .stat-value {
+        font-size: 1.5rem;
+        font-weight: bold;
+        margin-bottom: 0.5rem;
+    }
+    .stat-label {
+        font-size: 0.9rem;
+        opacity: 0.9;
+    }
+    .table-container {
+        margin-top: 1rem;
+    }
+    .refresh-btn {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        border: none;
+        padding: 0.5rem 1rem;
+        border-radius: 5px;
+        cursor: pointer;
+    }
+    """
+
+    with gr.Blocks(
+        title="Phoneme Detection Leaderboard",
+        css=custom_css,
+        theme=gr.themes.Soft()
+    ) as demo:
+
+        # Header section
+        with gr.Column(elem_classes="leaderboard-header"):
+            gr.Markdown("# 🎯 Phoneme Detection Leaderboard")
+            gr.Markdown("Compare phoneme recognition models across different datasets")
 
-        # Get initial data to determine columns dynamically
-        initial_df = load_results(EVAL_RESULTS_DIR)
-        if not initial_df.empty:
-            headers = list(initial_df.columns)
-            # Remove internal columns
-            headers = [h for h in headers if not h.startswith('_')]
-        else:
-            headers = ["Model", "Avg PER", "Avg Duration (s)"]
+        # Stats section
+        with gr.Row(elem_classes="stats-container"):
+            total_models = gr.HTML(
+                '<div class="stat-card"><div class="stat-value" id="total-models">-</div><div class="stat-label">Total Models</div></div>',
+                elem_id="total-models-card"
+            )
+            best_per = gr.HTML(
+                '<div class="stat-card"><div class="stat-value" id="best-per">-</div><div class="stat-label">Best PER</div></div>',
+                elem_id="best-per-card"
+            )
+            avg_duration = gr.HTML(
+                '<div class="stat-card"><div class="stat-value" id="avg-duration">-</div><div class="stat-label">Avg Duration</div></div>',
+                elem_id="avg-duration-card"
+            )
 
-        table = gr.Dataframe(headers=headers, row_count=5)
+        # Main content
+        with gr.Row():
+            with gr.Column(scale=4):
+                # Get initial data to determine columns dynamically
+                initial_df = load_results(EVAL_RESULTS_DIR)
+                if not initial_df.empty:
+                    headers = list(initial_df.columns)
+                    # Remove internal columns
+                    headers = [h for h in headers if not h.startswith('_')]
+                else:
+                    headers = ["Model", "Avg PER", "Avg Duration (s)"]
+
+                table = gr.Dataframe(
+                    headers=headers,
+                    row_count=10,
+                    label="🏆 Model Performance Leaderboard",
+                    interactive=False,
+                    elem_classes="table-container"
+                )
+
+            with gr.Column(scale=1):
+                refresh_btn = gr.Button(
+                    "🔄 Refresh Data",
+                    variant="primary",
+                    elem_classes="refresh-btn"
+                )
+
+                # Quick stats
+                with gr.Accordion("📊 Quick Stats", open=True):
+                    stats_display = gr.HTML("Loading statistics...")
+
+                # Export options
+                with gr.Accordion("📥 Export Data", open=False):
+                    export_csv = gr.Button("📄 Export as CSV", variant="secondary")
+                    export_json = gr.Button("📋 Export as JSON", variant="secondary")
 
         def refresh():
+            """Refresh the leaderboard data with performance optimization."""
+            start_time = time.time()
             df = load_results(EVAL_RESULTS_DIR)
+
             if df.empty:
-                return df
+                return df, "No data available", "No data available", "No data available"
 
             # Get the column order from the dataframe
             cols = [c for c in df.columns if not c.startswith('_')]
@@ -114,18 +227,92 @@ def build_interface():
             for c in cols:
                 if c not in df.columns:
                     df[c] = None
+
+            # Calculate stats
+            total_models = len(df)
+            best_per_val = df['Avg PER'].min() if 'Avg PER' in df.columns and not df['Avg PER'].isna().all() else "N/A"
+            avg_duration_val = df['Avg Duration (s)'].mean() if 'Avg Duration (s)' in df.columns and not df['Avg Duration (s)'].isna().all() else "N/A"
+
+            # Format stats
+            best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
+            avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)
+
+            load_time = time.time() - start_time
+
+            return (
+                df[cols].round(3),
+                f"<div class='stat-card'><div class='stat-value'>{total_models}</div><div class='stat-label'>Total Models</div></div>",
+                f"<div class='stat-card'><div class='stat-value'>{best_per_str}</div><div class='stat-label'>Best PER</div></div>",
+                f"<div class='stat-card'><div class='stat-value'>{avg_duration_str}</div><div class='stat-label'>Avg Duration</div></div>"
+            )
+
+        def export_csv_data():
+            """Export data as CSV."""
+            df = load_results(EVAL_RESULTS_DIR)
+            if df.empty:
+                return None
+            cols = [c for c in df.columns if not c.startswith('_')]
             return df[cols].round(3)
 
-        btn = gr.Button("Refresh")
-        btn.click(fn=refresh, outputs=table)
+        def export_json_data():
+            """Export data as JSON."""
+            df = load_results(EVAL_RESULTS_DIR)
+            if df.empty:
+                return None
+            cols = [c for c in df.columns if not c.startswith('_')]
+            return df[cols].round(3).to_json(orient='records', indent=2)
+
+        # Connect events
+        refresh_btn.click(
+            fn=refresh,
+            outputs=[table, total_models, best_per, avg_duration]
+        )
+
+        export_csv.click(
+            fn=export_csv_data,
+            outputs=gr.File(label="Download CSV")
+        )
+
+        export_json.click(
+            fn=export_json_data,
+            outputs=gr.File(label="Download JSON")
+        )
 
         # Auto-load on start
-        table.value = refresh()
+        table.value, total_models.value, best_per.value, avg_duration.value = refresh()
+
+        # Help section
+        with gr.Accordion("ℹ️ About this Leaderboard", open=False):
+            gr.Markdown("""
+            ## 📊 Understanding the Results
+
+            **Performance Metrics:**
+            - **PER (Phoneme Error Rate)**: Lower values indicate better performance
+            - **Avg Duration**: Processing time per sample (lower is faster)
+            - **Models are ranked by average PER across all datasets**
+
+            **Datasets Evaluated:**
+            - `phoneme_asr`: General phoneme recognition dataset
+            - `kids_phoneme_md`: Kids' phoneme recognition dataset
+
+            **How to Interpret:**
+            - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
+            - **Duration**: Time efficiency (important for real-time applications)
+            - **Average PER**: Overall model performance across all datasets
+
+            **Tips for Model Selection:**
+            - Choose models with low PER for accuracy-critical applications
+            - Consider duration for real-time or resource-constrained environments
+            - Balance between accuracy (PER) and speed (Duration) based on your needs
+            """)
+
     return demo
 
 
 if __name__ == "__main__":
     demo = build_interface()
-    demo.queue().launch()
-
-
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )
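The help text embedded above describes PER only informally ("percentage of phonemes incorrectly recognized"). The evaluation code that produced the numbers in the results files is not part of this commit; conventionally, PER is the edit (Levenshtein) distance between the predicted and reference phoneme sequences, normalised by the reference length. A small illustrative sketch of that convention, not the repository's actual metric code:

```python
def per(reference: list[str], hypothesis: list[str]) -> float:
    """Phoneme Error Rate in percent: edit distance / len(reference) * 100."""
    # Classic dynamic-programming edit distance over phoneme tokens.
    d = [[0] * (len(hypothesis) + 1) for _ in range(len(reference) + 1)]
    for i in range(len(reference) + 1):
        d[i][0] = i
    for j in range(len(hypothesis) + 1):
        d[0][j] = j
    for i in range(1, len(reference) + 1):
        for j in range(1, len(hypothesis) + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return 100.0 * d[len(reference)][len(hypothesis)] / max(len(reference), 1)

print(per(["k", "ae", "t"], ["k", "ah", "t"]))  # one substitution out of three -> 33.33...
```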
eval-results/results_1759289565_HuBERT-Base.json ADDED
@@ -0,0 +1,17 @@
+{
+    "config": {
+        "model_name": "local/HuBERT-Base",
+        "model_dtype": "float32",
+        "model_sha": ""
+    },
+    "results": {
+        "phoneme_asr": {
+            "per": 79.85359813133437,
+            "avg_duration": 0.5645037651062011
+        },
+        "kids_phoneme_md": {
+            "per": 71.85295670319688,
+            "avg_duration": 1.0543905973434449
+        }
+    }
+}
eval-results/results_1759289565_HuBERT-fine-tuned.json ADDED
@@ -0,0 +1,17 @@
+{
+    "config": {
+        "model_name": "local/HuBERT-fine-tuned",
+        "model_dtype": "float32",
+        "model_sha": ""
+    },
+    "results": {
+        "phoneme_asr": {
+            "per": 2.774112645808511,
+            "avg_duration": 0.5711040496826172
+        },
+        "kids_phoneme_md": {
+            "per": 12.210125572986708,
+            "avg_duration": 1.0601478815078735
+        }
+    }
+}
eval-results/results_1759289565_Timit.json ADDED
@@ -0,0 +1,17 @@
+{
+    "config": {
+        "model_name": "local/Timit",
+        "model_dtype": "float32",
+        "model_sha": ""
+    },
+    "results": {
+        "phoneme_asr": {
+            "per": 36.477283094931195,
+            "avg_duration": 0.554583740234375
+        },
+        "kids_phoneme_md": {
+            "per": 40.59831492610759,
+            "avg_duration": 1.0818484544754028
+        }
+    }
+}
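These three files are exactly what the `load_results` / `load_simple_results` helpers above aggregate: a model's "Avg PER" and "Avg Duration (s)" cells are plain means over its per-dataset entries. As a worked check for `local/HuBERT-Base`, using the values from its file:

```python
# Reproducing the aggregation performed by load_results for one results file.
per_phoneme_asr = 79.85359813133437
per_kids_phoneme_md = 71.85295670319688

avg_per = (per_phoneme_asr + per_kids_phoneme_md) / 2
print(round(avg_per, 3))  # 75.853 -> the "Avg PER" cell shown for local/HuBERT-Base
```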
src/about.py CHANGED
@@ -12,9 +12,9 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the results json, metric_key, column name for display
-    # Replace with your phoneme metrics and datasets
-    phoneme_dev = Task("phoneme_dev", "per", "Phoneme Dev PER")
-    phoneme_test = Task("phoneme_test", "per", "Phoneme Test PER")
+    # Using actual dataset names as keys
+    phoneme_asr = Task("phoneme_asr", "per", "PER phoneme_asr")
+    kids_phoneme_md = Task("kids_phoneme_md", "per", "PER kids_phoneme_md")
 
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
@@ -27,7 +27,7 @@ TITLE = """<h1 align="center" id="space-title">Phoneme Detection Leaderboard</h1
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 This leaderboard ranks phoneme detection models by average PER (lower is better).
-Evaluations aggregate across dev/test splits for a fair comparison.
+Evaluations aggregate across phoneme_asr and kids_phoneme_md datasets for a fair comparison.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
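For context on how the renamed tasks propagate: `app_default.py` builds its benchmark column names directly from this enum, so the new keys feed straight into the column metadata. A quick sketch of that mapping, mirroring the `BENCHMARK_COLS` line in `app_default.py` (the printed list below is the expected result, not verified output):

```python
from src.about import Tasks

# Mirrors: BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
benchmark_cols = [f"{task.value.col_name} ({task.name})" for task in Tasks]
print(benchmark_cols)
# Expected with the updated enum:
# ['PER phoneme_asr (phoneme_asr)', 'PER kids_phoneme_md (kids_phoneme_md)']
```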