jing084 committed on
Commit
9997570
·
verified ·
1 Parent(s): 9c35a5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -34
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import json
4
  from typing import List, Tuple
5
 
6
- os.environ["GRADIO_LANGUAGE"] = "en"
7
 
8
  RESULT_DIR = os.environ.get("MOECAP_RESULT_DIR")
9
  if not RESULT_DIR:
@@ -30,11 +30,13 @@ def json_to_row(path: str, metrics: dict) -> dict:
30
 
31
  dataset = metrics.get("dataset", "gsm8k")
32
 
33
- method = metrics.get("method", "")
34
- precision = metrics.get("precision", "")
35
- gsm8k_e2e = metrics.get("gsm8k_e2e_s", None)
36
- gsm8k_bs = metrics.get("gsm8k_bs", None)
37
- gsm8k_gpu = metrics.get("gpu_type", "")
 
 
38
 
39
  em = metrics.get("exact_match")
40
  correct = metrics.get("correct")
@@ -42,7 +44,7 @@ def json_to_row(path: str, metrics: dict) -> dict:
42
  if isinstance(correct, (int, float)) and isinstance(total, (int, float)) and total > 0:
43
  acc = correct / total
44
  else:
45
- acc = em
46
 
47
  def pct(x):
48
  return round(x * 100, 2) if isinstance(x, (int, float)) else None
@@ -57,6 +59,7 @@ def json_to_row(path: str, metrics: dict) -> dict:
57
  "Model": model_cell,
58
  "Dataset": dataset,
59
  "Method": method,
 
60
  "Precision": precision,
61
  "GSM8K<br>E2E(s)": f2(gsm8k_e2e),
62
  "GSM8K<br>bs": gsm8k_bs,
@@ -64,12 +67,10 @@ def json_to_row(path: str, metrics: dict) -> dict:
64
  "GSM8K<br>Accuracy(%)": pct(acc),
65
  "GSM8K<br>Decoding T/s": f2(metrics.get("decoding_throughput")),
66
  "GSM8K<br>Prefill T/s": f2(metrics.get("prefill_tp")),
67
-
68
  "GSM8K<br>Prefill<br>S-MBU(%)": pct(metrics.get("prefill_smbu")),
69
  "GSM8K<br>Prefill<br>S-MFU(%)": pct(metrics.get("prefill_smfu")),
70
  "GSM8K<br>Decoding<br>S-MBU(%)": pct(metrics.get("decoding_smbu")),
71
  "GSM8K<br>Decoding<br>S-MFU(%)": pct(metrics.get("decoding_smfu")),
72
-
73
  "TTFT(s)": f2(metrics.get("ttft")),
74
  "TPOT(s)": f2(metrics.get("tpot")),
75
  }
@@ -148,19 +149,24 @@ def build_leaderboard_from_files(files: List[gr.File], prev_rows: list | None =
148
  return summary_md, table_html, all_rows
149
 
150
 
151
-
152
- def load_from_dir(dir_path: str, selected_tasks: List[str] | None = None, force_refresh: bool = False):
153
-
 
 
 
 
 
154
  try:
155
  pattern = f"hf://datasets/{dir_path}/**/*.json"
156
  dl_mode = "force_redownload" if force_refresh else None
157
 
158
  print(f"Fetching from {pattern} (mode={dl_mode})...")
159
  ds = load_dataset(
160
- "json",
161
- data_files={"train": pattern},
162
  split="train",
163
- download_mode=dl_mode
164
  )
165
  except Exception as e:
166
  empty_html = "<p>No files loaded or Dataset not found.</p>"
@@ -183,9 +189,22 @@ def load_from_dir(dir_path: str, selected_tasks: List[str] | None = None, force_
183
  # Dataset filter
184
  if selected_tasks:
185
  df = df[df["Dataset"].isin(selected_tasks)]
186
- if df.empty:
187
- empty_html = "<p>No records found.</p>"
188
- return f"No records found in dataset `{dir_path}` for tasks {selected_tasks}.", empty_html
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  raw_models = set()
191
  for cell in df["Model"].tolist():
@@ -217,8 +236,21 @@ def load_from_dir(dir_path: str, selected_tasks: List[str] | None = None, force_
217
  return summary_md, table_html
218
 
219
 
220
- def auto_refresh_from_dir(dir_path: str, selected_tasks: List[str] | None = None):
221
- return load_from_dir(dir_path, selected_tasks=selected_tasks, force_refresh=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
 
224
  # Gradio UI
@@ -249,14 +281,10 @@ def build_app() -> gr.Blocks:
249
  "- Mathematics Problem-Solving Performance — "
250
  "[**GSM8K**](https://arxiv.org/abs/2110-14168)\n\n"
251
  "### Columns and Metrics\n"
252
- "- Model \n"
253
- "- Dataset \n"
254
- "- Method \n"
255
- "- Precision \n"
256
- "- GSM8K E2E (s) \n"
257
- "- GSM8K Batch Size \n"
258
  "- GPU Type \n"
259
- "- GSM8K Accuracy (%) \n"
260
  "- Decoding Throughput (tokens/s) \n"
261
  "- Prefill Throughput (tokens/s) \n"
262
  "- Prefill S-MBU (%) \n"
@@ -269,27 +297,71 @@ def build_app() -> gr.Blocks:
269
 
270
  with gr.Column(scale=1):
271
  dir_path = gr.Textbox(
272
- label="HF Dataset Repo ID",
273
- value=RESULT_DIR,
274
  lines=1,
275
- placeholder="username/dataset-name"
276
  )
277
- # Tasks filter
278
  task_filter = gr.CheckboxGroup(
279
  label="Tasks",
280
  choices=["gsm8k", "arena_hard", "mmlu", "NuminaMath"],
281
  value=["gsm8k", "arena_hard", "mmlu", "NuminaMath"],
282
  )
283
-
284
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  summary_output = gr.Markdown(label="Directory Summary")
287
  leaderboard_output = gr.HTML(label="Directory Metrics")
288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  timer = gr.Timer(10.0)
290
  timer.tick(
291
  fn=auto_refresh_from_dir,
292
- inputs=[dir_path, task_filter],
293
  outputs=[summary_output, leaderboard_output],
294
  )
295
 
 
3
  import json
4
  from typing import List, Tuple
5
 
6
+ os.environ["GRADIO_LANGUAGE"] = "en"
7
 
8
  RESULT_DIR = os.environ.get("MOECAP_RESULT_DIR")
9
  if not RESULT_DIR:
 
30
 
31
  dataset = metrics.get("dataset", "gsm8k")
32
 
33
+ method = metrics.get("method", "")
34
+ precision = metrics.get("precision", "")
35
+ model_type = metrics.get("model_type", "")
36
+
37
+ gsm8k_e2e = metrics.get("gsm8k_e2e_s", None)
38
+ gsm8k_bs = metrics.get("gsm8k_bs", None)
39
+ gsm8k_gpu = metrics.get("gpu_type", "")
40
 
41
  em = metrics.get("exact_match")
42
  correct = metrics.get("correct")
 
44
  if isinstance(correct, (int, float)) and isinstance(total, (int, float)) and total > 0:
45
  acc = correct / total
46
  else:
47
+ acc = em
48
 
49
  def pct(x):
50
  return round(x * 100, 2) if isinstance(x, (int, float)) else None
 
59
  "Model": model_cell,
60
  "Dataset": dataset,
61
  "Method": method,
62
+ "Model type": model_type,
63
  "Precision": precision,
64
  "GSM8K<br>E2E(s)": f2(gsm8k_e2e),
65
  "GSM8K<br>bs": gsm8k_bs,
 
67
  "GSM8K<br>Accuracy(%)": pct(acc),
68
  "GSM8K<br>Decoding T/s": f2(metrics.get("decoding_throughput")),
69
  "GSM8K<br>Prefill T/s": f2(metrics.get("prefill_tp")),
 
70
  "GSM8K<br>Prefill<br>S-MBU(%)": pct(metrics.get("prefill_smbu")),
71
  "GSM8K<br>Prefill<br>S-MFU(%)": pct(metrics.get("prefill_smfu")),
72
  "GSM8K<br>Decoding<br>S-MBU(%)": pct(metrics.get("decoding_smbu")),
73
  "GSM8K<br>Decoding<br>S-MFU(%)": pct(metrics.get("decoding_smfu")),
 
74
  "TTFT(s)": f2(metrics.get("ttft")),
75
  "TPOT(s)": f2(metrics.get("tpot")),
76
  }
 
149
  return summary_md, table_html, all_rows
150
 
151
 
152
+ def load_from_dir(
153
+ dir_path: str,
154
+ selected_tasks: List[str] | None = None,
155
+ selected_frameworks: List[str] | None = None,
156
+ selected_model_types: List[str] | None = None,
157
+ selected_precisions: List[str] | None = None,
158
+ force_refresh: bool = False,
159
+ ):
160
  try:
161
  pattern = f"hf://datasets/{dir_path}/**/*.json"
162
  dl_mode = "force_redownload" if force_refresh else None
163
 
164
  print(f"Fetching from {pattern} (mode={dl_mode})...")
165
  ds = load_dataset(
166
+ "json",
167
+ data_files={"train": pattern},
168
  split="train",
169
+ download_mode=dl_mode,
170
  )
171
  except Exception as e:
172
  empty_html = "<p>No files loaded or Dataset not found.</p>"
 
189
  # Dataset filter
190
  if selected_tasks:
191
  df = df[df["Dataset"].isin(selected_tasks)]
192
+
193
+ # Inference framework filter (Method)
194
+ if selected_frameworks:
195
+ df = df[df["Method"].isin(selected_frameworks)]
196
+
197
+ # Model type filter
198
+ if selected_model_types:
199
+ df = df[df["Model type"].isin(selected_model_types)]
200
+
201
+ # Precision filter
202
+ if selected_precisions:
203
+ df = df[df["Precision"].isin(selected_precisions)]
204
+
205
+ if df.empty:
206
+ empty_html = "<p>No records found.</p>"
207
+ return f"No records found in dataset `{dir_path}` after filtering.", empty_html
208
 
209
  raw_models = set()
210
  for cell in df["Model"].tolist():
 
236
  return summary_md, table_html
237
 
238
 
239
+ def auto_refresh_from_dir(
240
+ dir_path: str,
241
+ selected_tasks: List[str] | None = None,
242
+ selected_frameworks: List[str] | None = None,
243
+ selected_model_types: List[str] | None = None,
244
+ selected_precisions: List[str] | None = None,
245
+ ):
246
+ return load_from_dir(
247
+ dir_path,
248
+ selected_tasks=selected_tasks,
249
+ selected_frameworks=selected_frameworks,
250
+ selected_model_types=selected_model_types,
251
+ selected_precisions=selected_precisions,
252
+ force_refresh=True,
253
+ )
254
 
255
 
256
  # Gradio UI
 
281
  "- Mathematics Problem-Solving Performance — "
282
  "[**GSM8K**](https://arxiv.org/abs/2110-14168)\n\n"
283
  "### Columns and Metrics\n"
284
+ "- End-to-End Latency (s) \n"
285
+ "- Batch Size \n"
 
 
 
 
286
  "- GPU Type \n"
287
+ "- Accuracy (%) \n"
288
  "- Decoding Throughput (tokens/s) \n"
289
  "- Prefill Throughput (tokens/s) \n"
290
  "- Prefill S-MBU (%) \n"
 
297
 
298
  with gr.Column(scale=1):
299
  dir_path = gr.Textbox(
300
+ label="HF Dataset Repo ID",
301
+ value=RESULT_DIR,
302
  lines=1,
303
+ placeholder="username/dataset-name",
304
  )
305
+ # 1) Tasks filter
306
  task_filter = gr.CheckboxGroup(
307
  label="Tasks",
308
  choices=["gsm8k", "arena_hard", "mmlu", "NuminaMath"],
309
  value=["gsm8k", "arena_hard", "mmlu", "NuminaMath"],
310
  )
311
+ # 2) Inference frameworks filter
312
+ framework_filter = gr.CheckboxGroup(
313
+ label="Inference frameworks",
314
+ choices=["sglang", "vllm"],
315
+ value=["sglang", "vllm"],
316
+ )
317
+ # 3) Model types filter
318
+ model_type_filter = gr.CheckboxGroup(
319
+ label="Model types",
320
+ choices=["instruct", "thinking"],
321
+ value=["instruct", "thinking"],
322
+ )
323
+ # 4) Precision filter
324
+ precision_filter = gr.CheckboxGroup(
325
+ label="Precision",
326
+ choices=["bfloat16", "fp8"],
327
+ value=["bfloat16", "fp8"],
328
+ )
329
+ load_dir_button = gr.Button("Load from Dataset")
330
 
331
  summary_output = gr.Markdown(label="Directory Summary")
332
  leaderboard_output = gr.HTML(label="Directory Metrics")
333
 
334
+ load_dir_button.click(
335
+ fn=load_from_dir,
336
+ inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
337
+ outputs=[summary_output, leaderboard_output],
338
+ )
339
+
340
+ task_filter.change(
341
+ fn=load_from_dir,
342
+ inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
343
+ outputs=[summary_output, leaderboard_output],
344
+ )
345
+ framework_filter.change(
346
+ fn=load_from_dir,
347
+ inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
348
+ outputs=[summary_output, leaderboard_output],
349
+ )
350
+ model_type_filter.change(
351
+ fn=load_from_dir,
352
+ inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
353
+ outputs=[summary_output, leaderboard_output],
354
+ )
355
+ precision_filter.change(
356
+ fn=load_from_dir,
357
+ inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
358
+ outputs=[summary_output, leaderboard_output],
359
+ )
360
+
361
  timer = gr.Timer(10.0)
362
  timer.tick(
363
  fn=auto_refresh_from_dir,
364
+ inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
365
  outputs=[summary_output, leaderboard_output],
366
  )
367