Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -193,7 +193,6 @@ def load_from_dir(
|
|
| 193 |
lower_selected = [x.lower() for x in selected_tasks]
|
| 194 |
df = df[df["Dataset"].astype(str).str.lower().isin(lower_selected)]
|
| 195 |
|
| 196 |
-
|
| 197 |
# Inference framework filter (Method)
|
| 198 |
if selected_frameworks is not None:
|
| 199 |
lower_selected = [str(x).lower() for x in selected_frameworks]
|
|
@@ -216,7 +215,6 @@ def load_from_dir(
|
|
| 216 |
df = df.fillna("-")
|
| 217 |
raw_models = set()
|
| 218 |
|
| 219 |
-
|
| 220 |
for cell in df["Model"].tolist():
|
| 221 |
if isinstance(cell, str) and "href" in cell:
|
| 222 |
try:
|
|
@@ -236,11 +234,6 @@ def load_from_dir(
|
|
| 236 |
links.append(str(name))
|
| 237 |
models_str = ", ".join(links)
|
| 238 |
|
| 239 |
-
# summary_md = (
|
| 240 |
-
# f"**Loaded {len(df)} result files from dataset `{dir_path}`.** \n"
|
| 241 |
-
# f"**Models:** {models_str}"
|
| 242 |
-
# )
|
| 243 |
-
|
| 244 |
table_html = df.to_html(escape=False, index=False, classes="metrics-table")
|
| 245 |
return table_html
|
| 246 |
|
|
@@ -265,131 +258,403 @@ def auto_refresh_from_dir(
|
|
| 265 |
# Gradio UI
|
| 266 |
|
| 267 |
def build_app() -> gr.Blocks:
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
padding
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
.
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
width: 100%;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
}
|
| 281 |
"""
|
| 282 |
|
| 283 |
-
with gr.Blocks(title="MoE-CAP Dashboard", css=
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
with gr.Column(scale=1):
|
| 288 |
gr.Markdown(
|
| 289 |
-
"
|
| 290 |
-
"-
|
| 291 |
-
"
|
| 292 |
-
"-
|
| 293 |
-
"[**LongBench**](https://arxiv.org/abs/2412.15204)\n"
|
| 294 |
-
"- Massive Multitask Language Understanding β "
|
| 295 |
-
"[**MMLU**](https://arxiv.org/abs/2009.03300)\n"
|
| 296 |
-
"- Mathematical Reasoning β "
|
| 297 |
-
"[**NuminaMath**](http://faculty.bicmr.pku.edu.cn/~dongbin/Publications/numina_dataset.pdf)\n"
|
| 298 |
-
"- Extreme Long-Context Evaluation β "
|
| 299 |
-
"[**RULER**](https://arxiv.org/abs/2404.06654)\n\n"
|
| 300 |
-
|
| 301 |
-
"### Columns and Metrics\n"
|
| 302 |
-
"- End-to-End Latency (s) \n"
|
| 303 |
-
"- Batch Size \n"
|
| 304 |
-
"- GPU Type \n"
|
| 305 |
-
"- Accuracy (%) \n"
|
| 306 |
-
"- Cost ($) \n"
|
| 307 |
-
"- Decoding Throughput (tokens/s) \n"
|
| 308 |
-
"- Prefill Throughput (tokens/s) \n"
|
| 309 |
-
"- Prefill S-MBU (%) \n"
|
| 310 |
-
"- Prefill S-MFU (%) \n"
|
| 311 |
-
"- Decoding S-MBU (%) \n"
|
| 312 |
-
"- Decoding S-MFU (%) \n"
|
| 313 |
-
"- TTFT (s) \n"
|
| 314 |
-
"- TPOT (s)"
|
| 315 |
)
|
| 316 |
-
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
-
#
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
(
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
# 4) Precision filter
|
| 347 |
-
precision_filter = gr.CheckboxGroup(
|
| 348 |
-
label="Precision",
|
| 349 |
-
choices=["bfloat16", "fp8"],
|
| 350 |
-
value=["bfloat16", "fp8"],
|
| 351 |
-
)
|
| 352 |
-
|
| 353 |
-
# summary_output = gr.Markdown(label="Directory Summary")
|
| 354 |
-
leaderboard_output = gr.HTML(label="Directory Metrics")
|
| 355 |
-
|
| 356 |
-
# demo.load(
|
| 357 |
-
# fn=load_from_dir,
|
| 358 |
-
# inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 359 |
-
# outputs=[leaderboard_output],
|
| 360 |
-
# )
|
| 361 |
-
|
| 362 |
demo.load(
|
| 363 |
-
fn=
|
| 364 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 365 |
outputs=[leaderboard_output],
|
| 366 |
)
|
| 367 |
-
|
| 368 |
-
|
| 369 |
task_filter.change(
|
| 370 |
-
fn=
|
| 371 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 372 |
outputs=[leaderboard_output],
|
| 373 |
)
|
| 374 |
framework_filter.change(
|
| 375 |
-
fn=
|
| 376 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 377 |
outputs=[leaderboard_output],
|
| 378 |
)
|
| 379 |
model_type_filter.change(
|
| 380 |
-
fn=
|
| 381 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 382 |
outputs=[leaderboard_output],
|
| 383 |
)
|
| 384 |
precision_filter.change(
|
| 385 |
-
fn=
|
| 386 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 387 |
outputs=[leaderboard_output],
|
| 388 |
)
|
| 389 |
-
|
|
|
|
| 390 |
timer = gr.Timer(60.0)
|
| 391 |
timer.tick(
|
| 392 |
-
fn=
|
| 393 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 394 |
outputs=[leaderboard_output],
|
| 395 |
)
|
|
|
|
| 193 |
lower_selected = [x.lower() for x in selected_tasks]
|
| 194 |
df = df[df["Dataset"].astype(str).str.lower().isin(lower_selected)]
|
| 195 |
|
|
|
|
| 196 |
# Inference framework filter (Method)
|
| 197 |
if selected_frameworks is not None:
|
| 198 |
lower_selected = [str(x).lower() for x in selected_frameworks]
|
|
|
|
| 215 |
df = df.fillna("-")
|
| 216 |
raw_models = set()
|
| 217 |
|
|
|
|
| 218 |
for cell in df["Model"].tolist():
|
| 219 |
if isinstance(cell, str) and "href" in cell:
|
| 220 |
try:
|
|
|
|
| 234 |
links.append(str(name))
|
| 235 |
models_str = ", ".join(links)
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
table_html = df.to_html(escape=False, index=False, classes="metrics-table")
|
| 238 |
return table_html
|
| 239 |
|
|
|
|
| 258 |
# Gradio UI
|
| 259 |
|
| 260 |
def build_app() -> gr.Blocks:
|
| 261 |
+
# Enhanced CSS with better layout and scrollable table
|
| 262 |
+
custom_css = """
|
| 263 |
+
/* Global container styling */
|
| 264 |
+
.gradio-container {
|
| 265 |
+
max-width: 100% !important;
|
| 266 |
+
padding: 0 !important;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
/* Header styling */
|
| 270 |
+
.header-container {
|
| 271 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 272 |
+
padding: 1.5rem 2rem;
|
| 273 |
+
margin: 0;
|
| 274 |
+
border-radius: 0;
|
| 275 |
+
color: white;
|
| 276 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
.header-container h1 {
|
| 280 |
+
color: white !important;
|
| 281 |
+
margin: 0;
|
| 282 |
+
font-size: 2rem;
|
| 283 |
+
font-weight: 600;
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
.header-subtitle {
|
| 287 |
+
color: rgba(255,255,255,0.9);
|
| 288 |
+
margin-top: 0.5rem;
|
| 289 |
+
font-size: 0.95rem;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
/* Main content area */
|
| 293 |
+
.main-content {
|
| 294 |
+
display: flex;
|
| 295 |
+
height: calc(100vh - 120px);
|
| 296 |
+
gap: 1rem;
|
| 297 |
+
padding: 1rem;
|
| 298 |
+
background: #f8f9fa;
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
/* Sidebar styling */
|
| 302 |
+
.sidebar-container {
|
| 303 |
+
background: white;
|
| 304 |
+
border-radius: 8px;
|
| 305 |
+
padding: 1.5rem;
|
| 306 |
+
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
|
| 307 |
+
overflow-y: auto;
|
| 308 |
+
max-height: 100%;
|
| 309 |
+
width: 350px;
|
| 310 |
+
flex-shrink: 0;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
.sidebar-section {
|
| 314 |
+
margin-bottom: 1.5rem;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
.sidebar-section h3 {
|
| 318 |
+
font-size: 1.1rem;
|
| 319 |
+
font-weight: 600;
|
| 320 |
+
color: #2d3748;
|
| 321 |
+
margin-bottom: 0.75rem;
|
| 322 |
+
padding-bottom: 0.5rem;
|
| 323 |
+
border-bottom: 2px solid #e2e8f0;
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
/* Filter styling */
|
| 327 |
+
.filter-group {
|
| 328 |
+
background: #f7fafc;
|
| 329 |
+
border-radius: 6px;
|
| 330 |
+
padding: 0.75rem;
|
| 331 |
+
margin-bottom: 1rem;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
.filter-group label {
|
| 335 |
+
font-weight: 500;
|
| 336 |
+
color: #4a5568;
|
| 337 |
+
font-size: 0.9rem;
|
| 338 |
+
margin-bottom: 0.5rem;
|
| 339 |
+
display: block;
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
/* Table container */
|
| 343 |
+
.table-container {
|
| 344 |
+
flex: 1;
|
| 345 |
+
background: white;
|
| 346 |
+
border-radius: 8px;
|
| 347 |
+
padding: 1.5rem;
|
| 348 |
+
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
|
| 349 |
+
display: flex;
|
| 350 |
+
flex-direction: column;
|
| 351 |
+
min-width: 0;
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
/* Stats bar */
|
| 355 |
+
.stats-bar {
|
| 356 |
+
display: flex;
|
| 357 |
+
gap: 2rem;
|
| 358 |
+
padding: 1rem;
|
| 359 |
+
background: #f7fafc;
|
| 360 |
+
border-radius: 6px;
|
| 361 |
+
margin-bottom: 1rem;
|
| 362 |
+
align-items: center;
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
.stat-item {
|
| 366 |
+
display: flex;
|
| 367 |
+
flex-direction: column;
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
.stat-label {
|
| 371 |
+
font-size: 0.8rem;
|
| 372 |
+
color: #718096;
|
| 373 |
+
text-transform: uppercase;
|
| 374 |
+
letter-spacing: 0.05em;
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
.stat-value {
|
| 378 |
+
font-size: 1.5rem;
|
| 379 |
+
font-weight: 600;
|
| 380 |
+
color: #2d3748;
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
/* Scrollable table wrapper */
|
| 384 |
+
.table-wrapper {
|
| 385 |
+
flex: 1;
|
| 386 |
+
overflow: auto;
|
| 387 |
+
border: 1px solid #e2e8f0;
|
| 388 |
+
border-radius: 6px;
|
| 389 |
+
max-height: calc(100vh - 280px);
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
/* Table styling */
|
| 393 |
+
table.metrics-table {
|
| 394 |
width: 100%;
|
| 395 |
+
border-collapse: separate;
|
| 396 |
+
border-spacing: 0;
|
| 397 |
+
font-size: 0.9rem;
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
table.metrics-table thead {
|
| 401 |
+
position: sticky;
|
| 402 |
+
top: 0;
|
| 403 |
+
background: linear-gradient(to bottom, #f7fafc, #edf2f7);
|
| 404 |
+
z-index: 10;
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
table.metrics-table th {
|
| 408 |
+
padding: 0.75rem;
|
| 409 |
+
text-align: left;
|
| 410 |
+
font-weight: 600;
|
| 411 |
+
color: #2d3748;
|
| 412 |
+
border-bottom: 2px solid #cbd5e0;
|
| 413 |
+
white-space: nowrap;
|
| 414 |
+
font-size: 0.85rem;
|
| 415 |
+
text-transform: uppercase;
|
| 416 |
+
letter-spacing: 0.05em;
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
table.metrics-table td {
|
| 420 |
+
padding: 0.75rem;
|
| 421 |
+
border-bottom: 1px solid #e2e8f0;
|
| 422 |
+
color: #4a5568;
|
| 423 |
+
}
|
| 424 |
+
|
| 425 |
+
table.metrics-table tbody tr:hover {
|
| 426 |
+
background-color: #f7fafc;
|
| 427 |
+
transition: background-color 0.2s;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
table.metrics-table tbody tr:last-child td {
|
| 431 |
+
border-bottom: none;
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
/* Model links */
|
| 435 |
+
table.metrics-table a {
|
| 436 |
+
color: #4c6ef5;
|
| 437 |
+
text-decoration: none;
|
| 438 |
+
font-weight: 500;
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
table.metrics-table a:hover {
|
| 442 |
+
text-decoration: underline;
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
/* Empty state */
|
| 446 |
+
.empty-state {
|
| 447 |
+
display: flex;
|
| 448 |
+
flex-direction: column;
|
| 449 |
+
align-items: center;
|
| 450 |
+
justify-content: center;
|
| 451 |
+
height: 400px;
|
| 452 |
+
color: #718096;
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
.empty-state p {
|
| 456 |
+
font-size: 1.1rem;
|
| 457 |
+
margin-top: 1rem;
|
| 458 |
+
}
|
| 459 |
+
|
| 460 |
+
/* Responsive adjustments */
|
| 461 |
+
@media (max-width: 1024px) {
|
| 462 |
+
.main-content {
|
| 463 |
+
flex-direction: column;
|
| 464 |
+
height: auto;
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
.sidebar-container {
|
| 468 |
+
width: 100%;
|
| 469 |
+
max-height: none;
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
.table-wrapper {
|
| 473 |
+
max-height: 500px;
|
| 474 |
+
}
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
/* Checkbox group styling */
|
| 478 |
+
.gradio-checkbox-group {
|
| 479 |
+
display: flex;
|
| 480 |
+
flex-direction: column;
|
| 481 |
+
gap: 0.5rem;
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
.gradio-checkbox-group label {
|
| 485 |
+
display: flex;
|
| 486 |
+
align-items: center;
|
| 487 |
+
padding: 0.25rem;
|
| 488 |
+
border-radius: 4px;
|
| 489 |
+
transition: background-color 0.2s;
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
.gradio-checkbox-group label:hover {
|
| 493 |
+
background-color: #edf2f7;
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
/* Loading indicator */
|
| 497 |
+
.loading-indicator {
|
| 498 |
+
display: flex;
|
| 499 |
+
align-items: center;
|
| 500 |
+
justify-content: center;
|
| 501 |
+
padding: 2rem;
|
| 502 |
+
color: #718096;
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
/* Hide Gradio footer */
|
| 506 |
+
footer {
|
| 507 |
+
display: none !important;
|
| 508 |
}
|
| 509 |
"""
|
| 510 |
|
| 511 |
+
with gr.Blocks(title="MoE-CAP Dashboard", css=custom_css) as demo:
|
| 512 |
+
# Header
|
| 513 |
+
with gr.Row(elem_classes="header-container"):
|
| 514 |
+
with gr.Column():
|
|
|
|
| 515 |
gr.Markdown(
|
| 516 |
+
"""# π MoE-CAP Dashboard
|
| 517 |
+
<div class="header-subtitle">Comprehensive Model Performance Metrics and Benchmarks</div>
|
| 518 |
+
""",
|
| 519 |
+
elem_classes="header-title"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
)
|
| 521 |
+
|
| 522 |
+
# Main content area
|
| 523 |
+
with gr.Row(elem_classes="main-content"):
|
| 524 |
+
# Sidebar
|
| 525 |
+
with gr.Column(scale=1, elem_classes="sidebar-container"):
|
| 526 |
+
# Filters section
|
| 527 |
+
with gr.Group(elem_classes="sidebar-section"):
|
| 528 |
+
gr.Markdown("### π― Filters", elem_classes="filter-header")
|
| 529 |
+
|
| 530 |
+
dir_path = gr.State(RESULT_DIR)
|
| 531 |
+
|
| 532 |
+
# Task filter
|
| 533 |
+
with gr.Group(elem_classes="filter-group"):
|
| 534 |
+
task_filter = gr.CheckboxGroup(
|
| 535 |
+
label="π Tasks",
|
| 536 |
+
choices=[
|
| 537 |
+
("GSM8K", "gsm8k"),
|
| 538 |
+
("LongBench", "longbench"),
|
| 539 |
+
("MMLU", "mmlu"),
|
| 540 |
+
("NuminaMath", "numinamath"),
|
| 541 |
+
("RULER", "ruler")
|
| 542 |
+
],
|
| 543 |
+
value=["gsm8k", "longbench", "mmlu", "numinamath", "ruler"]
|
| 544 |
+
)
|
| 545 |
+
|
| 546 |
+
# Framework filter
|
| 547 |
+
with gr.Group(elem_classes="filter-group"):
|
| 548 |
+
framework_filter = gr.CheckboxGroup(
|
| 549 |
+
label="βοΈ Inference Frameworks",
|
| 550 |
+
choices=["sglang", "vllm"],
|
| 551 |
+
value=["sglang", "vllm"],
|
| 552 |
+
)
|
| 553 |
+
|
| 554 |
+
# Model type filter
|
| 555 |
+
with gr.Group(elem_classes="filter-group"):
|
| 556 |
+
model_type_filter = gr.CheckboxGroup(
|
| 557 |
+
label="π€ Model Types",
|
| 558 |
+
choices=["instruct", "thinking"],
|
| 559 |
+
value=["instruct", "thinking"],
|
| 560 |
+
)
|
| 561 |
+
|
| 562 |
+
# Precision filter
|
| 563 |
+
with gr.Group(elem_classes="filter-group"):
|
| 564 |
+
precision_filter = gr.CheckboxGroup(
|
| 565 |
+
label="ποΈ Precision",
|
| 566 |
+
choices=["bfloat16", "fp8"],
|
| 567 |
+
value=["bfloat16", "fp8"],
|
| 568 |
+
)
|
| 569 |
|
| 570 |
+
# Information section
|
| 571 |
+
with gr.Group(elem_classes="sidebar-section"):
|
| 572 |
+
gr.Markdown("### π About")
|
| 573 |
+
gr.Markdown(
|
| 574 |
+
"""
|
| 575 |
+
**Benchmarks:**
|
| 576 |
+
- [GSM8K](https://arxiv.org/abs/2110.14168) - Math Problem-Solving
|
| 577 |
+
- [LongBench](https://arxiv.org/abs/2412.15204) - Long-Context Understanding
|
| 578 |
+
- [MMLU](https://arxiv.org/abs/2009.03300) - Multitask Understanding
|
| 579 |
+
- [NuminaMath](http://faculty.bicmr.pku.edu.cn/~dongbin/Publications/numina_dataset.pdf) - Mathematical Reasoning
|
| 580 |
+
- [RULER](https://arxiv.org/abs/2404.06654) - Extreme Long-Context
|
| 581 |
+
|
| 582 |
+
**Key Metrics:**
|
| 583 |
+
- E2E Latency, Throughput, Accuracy
|
| 584 |
+
- S-MBU/S-MFU Performance
|
| 585 |
+
- TTFT/TPOT Timing
|
| 586 |
+
""",
|
| 587 |
+
elem_classes="info-text"
|
| 588 |
+
)
|
| 589 |
+
|
| 590 |
+
# Table area
|
| 591 |
+
with gr.Column(scale=3, elem_classes="table-container"):
|
| 592 |
+
# Stats summary (optional - you can populate this with actual stats)
|
| 593 |
+
with gr.Row(elem_classes="stats-bar", visible=False):
|
| 594 |
+
with gr.Column(elem_classes="stat-item"):
|
| 595 |
+
gr.Markdown('<div class="stat-label">Total Models</div><div class="stat-value">0</div>')
|
| 596 |
+
with gr.Column(elem_classes="stat-item"):
|
| 597 |
+
gr.Markdown('<div class="stat-label">Avg Accuracy</div><div class="stat-value">0%</div>')
|
| 598 |
+
with gr.Column(elem_classes="stat-item"):
|
| 599 |
+
gr.Markdown('<div class="stat-label">Best E2E</div><div class="stat-value">0s</div>')
|
| 600 |
|
| 601 |
+
# Scrollable table
|
| 602 |
+
with gr.Row():
|
| 603 |
+
with gr.Column():
|
| 604 |
+
gr.Markdown("### π Performance Metrics")
|
| 605 |
+
leaderboard_output = gr.HTML(
|
| 606 |
+
label="Metrics Table",
|
| 607 |
+
elem_classes="table-wrapper"
|
| 608 |
+
)
|
| 609 |
+
|
| 610 |
+
# Wrap table HTML in scrollable div
|
| 611 |
+
def wrap_table_html(html):
|
| 612 |
+
if html and "table" in html:
|
| 613 |
+
return f'<div class="table-wrapper">{html}</div>'
|
| 614 |
+
return html
|
| 615 |
+
|
| 616 |
+
# Modified load function to wrap table
|
| 617 |
+
def load_from_dir_wrapped(*args, **kwargs):
|
| 618 |
+
result = load_from_dir(*args, **kwargs)
|
| 619 |
+
return wrap_table_html(result)
|
| 620 |
+
|
| 621 |
+
def auto_refresh_from_dir_wrapped(*args, **kwargs):
|
| 622 |
+
result = auto_refresh_from_dir(*args, **kwargs)
|
| 623 |
+
return wrap_table_html(result)
|
| 624 |
+
|
| 625 |
+
# Load initial data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 626 |
demo.load(
|
| 627 |
+
fn=auto_refresh_from_dir_wrapped,
|
| 628 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 629 |
outputs=[leaderboard_output],
|
| 630 |
)
|
| 631 |
+
|
| 632 |
+
# Filter change handlers
|
| 633 |
task_filter.change(
|
| 634 |
+
fn=load_from_dir_wrapped,
|
| 635 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 636 |
outputs=[leaderboard_output],
|
| 637 |
)
|
| 638 |
framework_filter.change(
|
| 639 |
+
fn=load_from_dir_wrapped,
|
| 640 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 641 |
outputs=[leaderboard_output],
|
| 642 |
)
|
| 643 |
model_type_filter.change(
|
| 644 |
+
fn=load_from_dir_wrapped,
|
| 645 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 646 |
outputs=[leaderboard_output],
|
| 647 |
)
|
| 648 |
precision_filter.change(
|
| 649 |
+
fn=load_from_dir_wrapped,
|
| 650 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 651 |
outputs=[leaderboard_output],
|
| 652 |
)
|
| 653 |
+
|
| 654 |
+
# Auto-refresh timer
|
| 655 |
timer = gr.Timer(60.0)
|
| 656 |
timer.tick(
|
| 657 |
+
fn=auto_refresh_from_dir_wrapped,
|
| 658 |
inputs=[dir_path, task_filter, framework_filter, model_type_filter, precision_filter],
|
| 659 |
outputs=[leaderboard_output],
|
| 660 |
)
|