AppleSwing committed on
Commit
1d0cc7f
·
verified ·
1 Parent(s): 545e209

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +367 -102
app.py CHANGED
@@ -193,7 +193,6 @@ def load_from_dir(
193
  lower_selected = [x.lower() for x in selected_tasks]
194
  df = df[df["Dataset"].astype(str).str.lower().isin(lower_selected)]
195
 
196
-
197
  # Inference framework filter (Method)
198
  if selected_frameworks is not None:
199
  lower_selected = [str(x).lower() for x in selected_frameworks]
@@ -216,7 +215,6 @@ def load_from_dir(
216
  df = df.fillna("-")
217
  raw_models = set()
218
 
219
-
220
  for cell in df["Model"].tolist():
221
  if isinstance(cell, str) and "href" in cell:
222
  try:
@@ -236,11 +234,6 @@ def load_from_dir(
236
  links.append(str(name))
237
  models_str = ", ".join(links)
238
 
239
- # summary_md = (
240
- # f"**Loaded {len(df)} result files from dataset `{dir_path}`.** \n"
241
- # f"**Models:** {models_str}"
242
- # )
243
-
244
  table_html = df.to_html(escape=False, index=False, classes="metrics-table")
245
  return table_html
246
 
@@ -265,131 +258,403 @@ def auto_refresh_from_dir(
265
  # Gradio UI
266
 
267
  def build_app() -> gr.Blocks:
268
- row_css = """
269
- .gradio-container table.metrics-table th,
270
- .gradio-container table.metrics-table td {
271
- padding-top: 10px;
272
- padding-bottom: 10px;
273
- padding-left: 8px;
274
- padding-right: 8px;
275
- border: 1px solid #e5e7eb;
276
- }
277
- .gradio-container table.metrics-table {
278
- border-collapse: collapse;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  width: 100%;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  }
281
  """
282
 
283
- with gr.Blocks(title="MoE-CAP Dashboard", css=row_css) as demo:
284
- gr.Markdown("# MoE-CAP Dashboard")
285
-
286
- with gr.Row():
287
- with gr.Column(scale=1):
288
  gr.Markdown(
289
- "### Tasks\n"
290
- "- Mathematics Problem-Solving Performance — "
291
- "[**GSM8K**](https://arxiv.org/abs/2110-14168)\n\n"
292
- "- Long-Context Understanding — "
293
- "[**LongBench**](https://arxiv.org/abs/2412.15204)\n"
294
- "- Massive Multitask Language Understanding — "
295
- "[**MMLU**](https://arxiv.org/abs/2009.03300)\n"
296
- "- Mathematical Reasoning — "
297
- "[**NuminaMath**](http://faculty.bicmr.pku.edu.cn/~dongbin/Publications/numina_dataset.pdf)\n"
298
- "- Extreme Long-Context Evaluation — "
299
- "[**RULER**](https://arxiv.org/abs/2404.06654)\n\n"
300
-
301
- "### Columns and Metrics\n"
302
- "- End-to-End Latency (s) \n"
303
- "- Batch Size \n"
304
- "- GPU Type \n"
305
- "- Accuracy (%) \n"
306
- "- Cost ($) \n"
307
- "- Decoding Throughput (tokens/s) \n"
308
- "- Prefill Throughput (tokens/s) \n"
309
- "- Prefill S-MBU (%) \n"
310
- "- Prefill S-MFU (%) \n"
311
- "- Decoding S-MBU (%) \n"
312
- "- Decoding S-MFU (%) \n"
313
- "- TTFT (s) \n"
314
- "- TPOT (s)"
315
  )
316
-
317
- with gr.Column(scale=1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- dir_path = gr.State(RESULT_DIR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- # 1) Tasks filter
322
- task_filter = gr.CheckboxGroup(
323
- label="Tasks",
324
- choices=[
325
- ("GSM8K", "gsm8k"),
326
- ("LongBench", "longbench"),
327
- ("MMLU", "mmlu"),
328
- ("NuminaMath", "numinamath"),
329
- ("RULER", "ruler")
330
- ],
331
- value=["gsm8k", "longbench", "mmlu", "numinamath", "ruler"]
332
- )
333
-
334
- # 2) Inference frameworks filter
335
- framework_filter = gr.CheckboxGroup(
336
- label="Inference frameworks",
337
- choices=["sglang", "vllm"],
338
- value=["sglang", "vllm"],
339
- )
340
- # 3) Model types filter
341
- model_type_filter = gr.CheckboxGroup(
342
- label="Model types",
343
- choices=["instruct", "thinking"],
344
- value=["instruct", "thinking"],
345
- )
346
- # 4) Precision filter
347
- precision_filter = gr.CheckboxGroup(
348
- label="Precision",
349
- choices=["bfloat16", "fp8"],
350
- value=["bfloat16", "fp8"],
351
- )
352
-
353
- # summary_output = gr.Markdown(label="Directory Summary")
354
- leaderboard_output = gr.HTML(label="Directory Metrics")
355
-
356
- # demo.load(
357
- # fn=load_from_dir,
358
- # inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
359
- # outputs=[leaderboard_output],
360
- # )
361
-
362
  demo.load(
363
- fn=auto_refresh_from_dir,
364
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
365
  outputs=[leaderboard_output],
366
  )
367
-
368
-
369
  task_filter.change(
370
- fn=load_from_dir,
371
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
372
  outputs=[leaderboard_output],
373
  )
374
  framework_filter.change(
375
- fn=load_from_dir,
376
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
377
  outputs=[leaderboard_output],
378
  )
379
  model_type_filter.change(
380
- fn=load_from_dir,
381
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
382
  outputs=[leaderboard_output],
383
  )
384
  precision_filter.change(
385
- fn=load_from_dir,
386
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
387
  outputs=[leaderboard_output],
388
  )
389
-
 
390
  timer = gr.Timer(60.0)
391
  timer.tick(
392
- fn=auto_refresh_from_dir,
393
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
394
  outputs=[leaderboard_output],
395
  )
 
193
  lower_selected = [x.lower() for x in selected_tasks]
194
  df = df[df["Dataset"].astype(str).str.lower().isin(lower_selected)]
195
 
 
196
  # Inference framework filter (Method)
197
  if selected_frameworks is not None:
198
  lower_selected = [str(x).lower() for x in selected_frameworks]
 
215
  df = df.fillna("-")
216
  raw_models = set()
217
 
 
218
  for cell in df["Model"].tolist():
219
  if isinstance(cell, str) and "href" in cell:
220
  try:
 
234
  links.append(str(name))
235
  models_str = ", ".join(links)
236
 
 
 
 
 
 
237
  table_html = df.to_html(escape=False, index=False, classes="metrics-table")
238
  return table_html
239
 
 
258
  # Gradio UI
259
 
260
  def build_app() -> gr.Blocks:
261
+ # Enhanced CSS with better layout and scrollable table
262
+ custom_css = """
263
+ /* Global container styling */
264
+ .gradio-container {
265
+ max-width: 100% !important;
266
+ padding: 0 !important;
267
+ }
268
+
269
+ /* Header styling */
270
+ .header-container {
271
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
272
+ padding: 1.5rem 2rem;
273
+ margin: 0;
274
+ border-radius: 0;
275
+ color: white;
276
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
277
+ }
278
+
279
+ .header-container h1 {
280
+ color: white !important;
281
+ margin: 0;
282
+ font-size: 2rem;
283
+ font-weight: 600;
284
+ }
285
+
286
+ .header-subtitle {
287
+ color: rgba(255,255,255,0.9);
288
+ margin-top: 0.5rem;
289
+ font-size: 0.95rem;
290
+ }
291
+
292
+ /* Main content area */
293
+ .main-content {
294
+ display: flex;
295
+ height: calc(100vh - 120px);
296
+ gap: 1rem;
297
+ padding: 1rem;
298
+ background: #f8f9fa;
299
+ }
300
+
301
+ /* Sidebar styling */
302
+ .sidebar-container {
303
+ background: white;
304
+ border-radius: 8px;
305
+ padding: 1.5rem;
306
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
307
+ overflow-y: auto;
308
+ max-height: 100%;
309
+ width: 350px;
310
+ flex-shrink: 0;
311
+ }
312
+
313
+ .sidebar-section {
314
+ margin-bottom: 1.5rem;
315
+ }
316
+
317
+ .sidebar-section h3 {
318
+ font-size: 1.1rem;
319
+ font-weight: 600;
320
+ color: #2d3748;
321
+ margin-bottom: 0.75rem;
322
+ padding-bottom: 0.5rem;
323
+ border-bottom: 2px solid #e2e8f0;
324
+ }
325
+
326
+ /* Filter styling */
327
+ .filter-group {
328
+ background: #f7fafc;
329
+ border-radius: 6px;
330
+ padding: 0.75rem;
331
+ margin-bottom: 1rem;
332
+ }
333
+
334
+ .filter-group label {
335
+ font-weight: 500;
336
+ color: #4a5568;
337
+ font-size: 0.9rem;
338
+ margin-bottom: 0.5rem;
339
+ display: block;
340
+ }
341
+
342
+ /* Table container */
343
+ .table-container {
344
+ flex: 1;
345
+ background: white;
346
+ border-radius: 8px;
347
+ padding: 1.5rem;
348
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
349
+ display: flex;
350
+ flex-direction: column;
351
+ min-width: 0;
352
+ }
353
+
354
+ /* Stats bar */
355
+ .stats-bar {
356
+ display: flex;
357
+ gap: 2rem;
358
+ padding: 1rem;
359
+ background: #f7fafc;
360
+ border-radius: 6px;
361
+ margin-bottom: 1rem;
362
+ align-items: center;
363
+ }
364
+
365
+ .stat-item {
366
+ display: flex;
367
+ flex-direction: column;
368
+ }
369
+
370
+ .stat-label {
371
+ font-size: 0.8rem;
372
+ color: #718096;
373
+ text-transform: uppercase;
374
+ letter-spacing: 0.05em;
375
+ }
376
+
377
+ .stat-value {
378
+ font-size: 1.5rem;
379
+ font-weight: 600;
380
+ color: #2d3748;
381
+ }
382
+
383
+ /* Scrollable table wrapper */
384
+ .table-wrapper {
385
+ flex: 1;
386
+ overflow: auto;
387
+ border: 1px solid #e2e8f0;
388
+ border-radius: 6px;
389
+ max-height: calc(100vh - 280px);
390
+ }
391
+
392
+ /* Table styling */
393
+ table.metrics-table {
394
  width: 100%;
395
+ border-collapse: separate;
396
+ border-spacing: 0;
397
+ font-size: 0.9rem;
398
+ }
399
+
400
+ table.metrics-table thead {
401
+ position: sticky;
402
+ top: 0;
403
+ background: linear-gradient(to bottom, #f7fafc, #edf2f7);
404
+ z-index: 10;
405
+ }
406
+
407
+ table.metrics-table th {
408
+ padding: 0.75rem;
409
+ text-align: left;
410
+ font-weight: 600;
411
+ color: #2d3748;
412
+ border-bottom: 2px solid #cbd5e0;
413
+ white-space: nowrap;
414
+ font-size: 0.85rem;
415
+ text-transform: uppercase;
416
+ letter-spacing: 0.05em;
417
+ }
418
+
419
+ table.metrics-table td {
420
+ padding: 0.75rem;
421
+ border-bottom: 1px solid #e2e8f0;
422
+ color: #4a5568;
423
+ }
424
+
425
+ table.metrics-table tbody tr:hover {
426
+ background-color: #f7fafc;
427
+ transition: background-color 0.2s;
428
+ }
429
+
430
+ table.metrics-table tbody tr:last-child td {
431
+ border-bottom: none;
432
+ }
433
+
434
+ /* Model links */
435
+ table.metrics-table a {
436
+ color: #4c6ef5;
437
+ text-decoration: none;
438
+ font-weight: 500;
439
+ }
440
+
441
+ table.metrics-table a:hover {
442
+ text-decoration: underline;
443
+ }
444
+
445
+ /* Empty state */
446
+ .empty-state {
447
+ display: flex;
448
+ flex-direction: column;
449
+ align-items: center;
450
+ justify-content: center;
451
+ height: 400px;
452
+ color: #718096;
453
+ }
454
+
455
+ .empty-state p {
456
+ font-size: 1.1rem;
457
+ margin-top: 1rem;
458
+ }
459
+
460
+ /* Responsive adjustments */
461
+ @media (max-width: 1024px) {
462
+ .main-content {
463
+ flex-direction: column;
464
+ height: auto;
465
+ }
466
+
467
+ .sidebar-container {
468
+ width: 100%;
469
+ max-height: none;
470
+ }
471
+
472
+ .table-wrapper {
473
+ max-height: 500px;
474
+ }
475
+ }
476
+
477
+ /* Checkbox group styling */
478
+ .gradio-checkbox-group {
479
+ display: flex;
480
+ flex-direction: column;
481
+ gap: 0.5rem;
482
+ }
483
+
484
+ .gradio-checkbox-group label {
485
+ display: flex;
486
+ align-items: center;
487
+ padding: 0.25rem;
488
+ border-radius: 4px;
489
+ transition: background-color 0.2s;
490
+ }
491
+
492
+ .gradio-checkbox-group label:hover {
493
+ background-color: #edf2f7;
494
+ }
495
+
496
+ /* Loading indicator */
497
+ .loading-indicator {
498
+ display: flex;
499
+ align-items: center;
500
+ justify-content: center;
501
+ padding: 2rem;
502
+ color: #718096;
503
+ }
504
+
505
+ /* Hide Gradio footer */
506
+ footer {
507
+ display: none !important;
508
  }
509
  """
510
 
511
+ with gr.Blocks(title="MoE-CAP Dashboard", css=custom_css) as demo:
512
+ # Header
513
+ with gr.Row(elem_classes="header-container"):
514
+ with gr.Column():
 
515
  gr.Markdown(
516
+ """# 🚀 MoE-CAP Dashboard
517
+ <div class="header-subtitle">Comprehensive Model Performance Metrics and Benchmarks</div>
518
+ """,
519
+ elem_classes="header-title"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  )
521
+
522
+ # Main content area
523
+ with gr.Row(elem_classes="main-content"):
524
+ # Sidebar
525
+ with gr.Column(scale=1, elem_classes="sidebar-container"):
526
+ # Filters section
527
+ with gr.Group(elem_classes="sidebar-section"):
528
+ gr.Markdown("### 🎯 Filters", elem_classes="filter-header")
529
+
530
+ dir_path = gr.State(RESULT_DIR)
531
+
532
+ # Task filter
533
+ with gr.Group(elem_classes="filter-group"):
534
+ task_filter = gr.CheckboxGroup(
535
+ label="📊 Tasks",
536
+ choices=[
537
+ ("GSM8K", "gsm8k"),
538
+ ("LongBench", "longbench"),
539
+ ("MMLU", "mmlu"),
540
+ ("NuminaMath", "numinamath"),
541
+ ("RULER", "ruler")
542
+ ],
543
+ value=["gsm8k", "longbench", "mmlu", "numinamath", "ruler"]
544
+ )
545
+
546
+ # Framework filter
547
+ with gr.Group(elem_classes="filter-group"):
548
+ framework_filter = gr.CheckboxGroup(
549
+ label="⚙️ Inference Frameworks",
550
+ choices=["sglang", "vllm"],
551
+ value=["sglang", "vllm"],
552
+ )
553
+
554
+ # Model type filter
555
+ with gr.Group(elem_classes="filter-group"):
556
+ model_type_filter = gr.CheckboxGroup(
557
+ label="🤖 Model Types",
558
+ choices=["instruct", "thinking"],
559
+ value=["instruct", "thinking"],
560
+ )
561
+
562
+ # Precision filter
563
+ with gr.Group(elem_classes="filter-group"):
564
+ precision_filter = gr.CheckboxGroup(
565
+ label="🎚️ Precision",
566
+ choices=["bfloat16", "fp8"],
567
+ value=["bfloat16", "fp8"],
568
+ )
569
 
570
+ # Information section
571
+ with gr.Group(elem_classes="sidebar-section"):
572
+ gr.Markdown("### 📖 About")
573
+ gr.Markdown(
574
+ """
575
+ **Benchmarks:**
576
+ - [GSM8K](https://arxiv.org/abs/2110.14168) - Math Problem-Solving
577
+ - [LongBench](https://arxiv.org/abs/2412.15204) - Long-Context Understanding
578
+ - [MMLU](https://arxiv.org/abs/2009.03300) - Multitask Understanding
579
+ - [NuminaMath](http://faculty.bicmr.pku.edu.cn/~dongbin/Publications/numina_dataset.pdf) - Mathematical Reasoning
580
+ - [RULER](https://arxiv.org/abs/2404.06654) - Extreme Long-Context
581
+
582
+ **Key Metrics:**
583
+ - E2E Latency, Throughput, Accuracy
584
+ - S-MBU/S-MFU Performance
585
+ - TTFT/TPOT Timing
586
+ """,
587
+ elem_classes="info-text"
588
+ )
589
+
590
+ # Table area
591
+ with gr.Column(scale=3, elem_classes="table-container"):
592
+ # Stats summary (optional - you can populate this with actual stats)
593
+ with gr.Row(elem_classes="stats-bar", visible=False):
594
+ with gr.Column(elem_classes="stat-item"):
595
+ gr.Markdown('<div class="stat-label">Total Models</div><div class="stat-value">0</div>')
596
+ with gr.Column(elem_classes="stat-item"):
597
+ gr.Markdown('<div class="stat-label">Avg Accuracy</div><div class="stat-value">0%</div>')
598
+ with gr.Column(elem_classes="stat-item"):
599
+ gr.Markdown('<div class="stat-label">Best E2E</div><div class="stat-value">0s</div>')
600
 
601
+ # Scrollable table
602
+ with gr.Row():
603
+ with gr.Column():
604
+ gr.Markdown("### 📊 Performance Metrics")
605
+ leaderboard_output = gr.HTML(
606
+ label="Metrics Table",
607
+ elem_classes="table-wrapper"
608
+ )
609
+
610
+ # Wrap table HTML in scrollable div
611
+ def wrap_table_html(html):
612
+ if html and "table" in html:
613
+ return f'<div class="table-wrapper">{html}</div>'
614
+ return html
615
+
616
+ # Modified load function to wrap table
617
+ def load_from_dir_wrapped(*args, **kwargs):
618
+ result = load_from_dir(*args, **kwargs)
619
+ return wrap_table_html(result)
620
+
621
+ def auto_refresh_from_dir_wrapped(*args, **kwargs):
622
+ result = auto_refresh_from_dir(*args, **kwargs)
623
+ return wrap_table_html(result)
624
+
625
+ # Load initial data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
626
  demo.load(
627
+ fn=auto_refresh_from_dir_wrapped,
628
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
629
  outputs=[leaderboard_output],
630
  )
631
+
632
+ # Filter change handlers
633
  task_filter.change(
634
+ fn=load_from_dir_wrapped,
635
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
636
  outputs=[leaderboard_output],
637
  )
638
  framework_filter.change(
639
+ fn=load_from_dir_wrapped,
640
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
641
  outputs=[leaderboard_output],
642
  )
643
  model_type_filter.change(
644
+ fn=load_from_dir_wrapped,
645
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
646
  outputs=[leaderboard_output],
647
  )
648
  precision_filter.change(
649
+ fn=load_from_dir_wrapped,
650
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
651
  outputs=[leaderboard_output],
652
  )
653
+
654
+ # Auto-refresh timer
655
  timer = gr.Timer(60.0)
656
  timer.tick(
657
+ fn=auto_refresh_from_dir_wrapped,
658
  inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
659
  outputs=[leaderboard_output],
660
  )