new regressions panel
Browse files- app.py +55 -4
- data.py +121 -0
- styles.css +48 -0
app.py
CHANGED
|
@@ -4,7 +4,7 @@ import pandas as pd
|
|
| 4 |
import gradio as gr
|
| 5 |
from gradio_toggle import Toggle
|
| 6 |
|
| 7 |
-
from data import CIResults
|
| 8 |
from utils import logger
|
| 9 |
from summary_page import create_summary_page
|
| 10 |
from model_page import plot_model_stats
|
|
@@ -107,6 +107,46 @@ def get_description_text():
|
|
| 107 |
msg.append("*(loading...)*")
|
| 108 |
return "<br>".join(msg)
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# Load CSS from external file
|
| 111 |
def load_css():
|
| 112 |
try:
|
|
@@ -266,6 +306,13 @@ with gr.Blocks(title="Model Test Results Dashboard", css=load_css(), js=js_func)
|
|
| 266 |
|
| 267 |
# Main content area
|
| 268 |
with gr.Column(scale=4, elem_classes=["main-content"]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
# Current view components
|
| 270 |
with gr.Column(visible=True, elem_classes=["current-view"]) as current_view:
|
| 271 |
# Summary display (default view)
|
|
@@ -853,10 +900,14 @@ with gr.Blocks(title="Model Test Results Dashboard", css=load_css(), js=js_func)
|
|
| 853 |
],
|
| 854 |
)
|
| 855 |
|
| 856 |
-
# Auto-update CI links when the interface loads
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
demo.load(
|
| 858 |
-
fn=
|
| 859 |
-
outputs=[ci_links_display]
|
| 860 |
)
|
| 861 |
|
| 862 |
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
from gradio_toggle import Toggle
|
| 6 |
|
| 7 |
+
from data import CIResults, find_new_regressions
|
| 8 |
from utils import logger
|
| 9 |
from summary_page import create_summary_page
|
| 10 |
from model_page import plot_model_stats
|
|
|
|
| 107 |
msg.append("*(loading...)*")
|
| 108 |
return "<br>".join(msg)
|
| 109 |
|
| 110 |
+
# Function to format new regressions for display
|
| 111 |
+
def get_regressions_text():
    """Build the Markdown shown in the new-regressions panel.

    Asks ``find_new_regressions`` for the regressions between the current
    results and the historical data, groups the short test names under a
    "model (DEVICE gpu_type)" heading, and lists at most five tests per
    heading followed by a "... and N more" line for the remainder.

    Returns:
        A Markdown string. A "no new regressions" message is returned when
        nothing was found, and a generic "unable to load" message when any
        exception is raised while computing or formatting the data.
    """
    max_listed = 5  # tests printed per heading before the rest is summarised
    try:
        found = find_new_regressions(Ci_results.df, Ci_results.all_historical_data)
        if not found:
            return "### 🎉 No New Regressions\nAll failures were present in the previous run."

        # Bucket test names by "model (DEVICE gpu_type)", keeping arrival order
        by_target = {}
        for entry in found:
            heading = f"{entry['model']} ({entry['device'].upper()} {entry['gpu_type']})"
            by_target.setdefault(heading, []).append(entry['test'])

        # Header first, then one section per heading in alphabetical order
        out = [f"### ⚠️ New Regressions Detected: {len(found)} failure(s)", ""]
        for heading in sorted(by_target):
            names = by_target[heading]
            out.append(f"**{heading}:**")
            out.extend(f" • {name}" for name in names[:max_listed])
            if len(names) > max_listed:
                out.append(f" • ... and {len(names) - max_listed} more")
            out.append("")

        return "\n".join(out)
    except Exception as e:
        # UI boundary: log the problem and show a placeholder instead of failing
        logger.error(f"Error getting regressions: {e}")
        return "### ⚠️ New Regressions\n*Unable to load regression data*"
|
| 149 |
+
|
| 150 |
# Load CSS from external file
|
| 151 |
def load_css():
|
| 152 |
try:
|
|
|
|
| 306 |
|
| 307 |
# Main content area
|
| 308 |
with gr.Column(scale=4, elem_classes=["main-content"]):
|
| 309 |
+
# New Regressions Panel (at the top for visibility)
|
| 310 |
+
regressions_panel = gr.Markdown(
|
| 311 |
+
value=get_regressions_text(),
|
| 312 |
+
elem_classes=["regressions-panel"],
|
| 313 |
+
visible=True
|
| 314 |
+
)
|
| 315 |
+
|
| 316 |
# Current view components
|
| 317 |
with gr.Column(visible=True, elem_classes=["current-view"]) as current_view:
|
| 318 |
# Summary display (default view)
|
|
|
|
| 900 |
],
|
| 901 |
)
|
| 902 |
|
| 903 |
+
# Auto-update CI links and regressions when the interface loads
|
| 904 |
+
def load_dashboard_data():
    """Return the CI-links text and the regressions text, in that order.

    The pair lines up with the two output components wired to the
    interface's load event (CI links display, then regressions panel).
    """
    ci_links_text = get_ci_links()
    regressions_text = get_regressions_text()
    return ci_links_text, regressions_text
|
| 907 |
+
|
| 908 |
demo.load(
|
| 909 |
+
fn=load_dashboard_data,
|
| 910 |
+
outputs=[ci_links_display, regressions_panel]
|
| 911 |
)
|
| 912 |
|
| 913 |
|
data.py
CHANGED
|
@@ -431,6 +431,127 @@ def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_n
|
|
| 431 |
return None
|
| 432 |
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
def extract_model_data(row: pd.Series) -> tuple[dict[str, int], dict[str, int], int, int, int, int]:
|
| 435 |
"""Extract and process model data from DataFrame row."""
|
| 436 |
# Handle missing values and get counts directly from dataframe
|
|
|
|
| 431 |
return None
|
| 432 |
|
| 433 |
|
| 434 |
+
def _coerce_failures(raw) -> dict:
    """Return a stored failures cell as a dict, or ``{}`` when unusable.

    A cell may hold a dict, a JSON string, or a missing value (None/NaN);
    anything that does not end up as a dict is treated as "no failures".
    """
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return {}
    return raw if isinstance(raw, dict) else {}


def _failing_test_names(failures: dict, gpu_type: str) -> set[str]:
    """Collect the non-empty ``'line'`` strings recorded under *gpu_type*."""
    entries = failures.get(gpu_type) or []
    if not isinstance(entries, (list, tuple)):
        return set()
    names = set()
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        name = entry.get('line', '')
        if isinstance(name, str) and name:
            names.add(name)
    return names


def find_new_regressions(current_df: pd.DataFrame, historical_df: pd.DataFrame) -> list[dict]:
    """
    Find tests that fail on the most recent historical date but did not fail
    on the date just before it.

    ``current_df`` is used only for its index (the model names to inspect);
    the failures themselves are read from ``historical_df``, whose ``date``
    column selects the two most recent runs and whose index is matched
    against the lower-cased model name. A model with no row on the earlier
    date is treated as having had no failures then.

    Args:
        current_df: frame indexed by model name.
        historical_df: frame with a ``date`` column and, optionally,
            ``failures_amd`` / ``failures_nvidia`` cells holding a dict (or
            its JSON string) keyed by ``'single'`` / ``'multi'`` with lists
            of ``{'line': <test id>}`` entries. Missing or malformed cells
            count as "no failures".

    Returns a list of dicts with:
    - model: model name (as it appears in ``current_df.index``)
    - test: short test name (text after the last ``::``)
    - test_full: full test name
    - device: 'amd' or 'nvidia'
    - gpu_type: 'single' or 'multi'

    Entries are ordered by model, then device (amd, nvidia), then gpu type
    (single, multi), then full test name, so the output is deterministic.
    An empty list is returned when either frame is empty or fewer than two
    distinct dates are available.
    """
    if current_df.empty or historical_df.empty:
        return []

    # Get the two most recent dates
    available_dates = sorted(historical_df['date'].unique(), reverse=True)
    if len(available_dates) < 2:
        # Not enough history to compare
        return []
    today_date, yesterday_date = available_dates[0], available_dates[1]

    today_data = historical_df[historical_df['date'] == today_date]
    yesterday_data = historical_df[historical_df['date'] == yesterday_date]

    new_regressions = []
    for model_name in current_df.index:
        model_key = model_name.lower()

        today_rows = today_data[today_data.index == model_key]
        if today_rows.empty:
            continue
        today_row = today_rows.iloc[0]

        yesterday_rows = yesterday_data[yesterday_data.index == model_key]
        yesterday_row = None if yesterday_rows.empty else yesterday_rows.iloc[0]

        for device in ('amd', 'nvidia'):
            column = f'failures_{device}'
            today_failures = _coerce_failures(today_row.get(column))
            yesterday_failures = (
                _coerce_failures(yesterday_row.get(column))
                if yesterday_row is not None
                else {}
            )

            for gpu_type in ('single', 'multi'):
                new_tests = (
                    _failing_test_names(today_failures, gpu_type)
                    - _failing_test_names(yesterday_failures, gpu_type)
                )
                # Sorted so repeated calls list regressions in the same order
                for test_name in sorted(new_tests):
                    new_regressions.append({
                        'model': model_name,
                        'test': test_name.split('::')[-1],  # Short name
                        'test_full': test_name,  # Full name
                        'device': device,
                        'gpu_type': gpu_type,
                    })

    return new_regressions
|
| 553 |
+
|
| 554 |
+
|
| 555 |
def extract_model_data(row: pd.Series) -> tuple[dict[str, int], dict[str, int], int, int, int, int]:
|
| 556 |
"""Extract and process model data from DataFrame row."""
|
| 557 |
# Handle missing values and get counts directly from dataframe
|
styles.css
CHANGED
|
@@ -594,6 +594,54 @@ h1, h2, h3, p, .markdown {
|
|
| 594 |
flex-direction: column !important;
|
| 595 |
}
|
| 596 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
/* Custom scrollbar for main content */
|
| 598 |
.main-content {
|
| 599 |
scrollbar-width: thin !important;
|
|
|
|
| 594 |
flex-direction: column !important;
|
| 595 |
}
|
| 596 |
|
| 597 |
+
/* New Regressions Panel */
|
| 598 |
+
.regressions-panel {
|
| 599 |
+
background: linear-gradient(145deg, #2a1a1a, #1a0f0f) !important;
|
| 600 |
+
border: 2px solid #8B4513 !important;
|
| 601 |
+
border-radius: 8px !important;
|
| 602 |
+
padding: 15px 20px !important;
|
| 603 |
+
margin: 15px 0px !important;
|
| 604 |
+
box-shadow: 0 4px 12px rgba(255, 107, 107, 0.2) !important;
|
| 605 |
+
animation: pulse-border 2s ease-in-out infinite !important;
|
| 606 |
+
}
|
| 607 |
+
|
| 608 |
+
.regressions-panel h3 {
|
| 609 |
+
color: #FFB86C !important;
|
| 610 |
+
font-family: monospace !important;
|
| 611 |
+
font-size: 16px !important;
|
| 612 |
+
font-weight: bold !important;
|
| 613 |
+
margin: 0 0 10px 0 !important;
|
| 614 |
+
display: flex !important;
|
| 615 |
+
align-items: center !important;
|
| 616 |
+
}
|
| 617 |
+
|
| 618 |
+
.regressions-panel p,
|
| 619 |
+
.regressions-panel ul,
|
| 620 |
+
.regressions-panel li {
|
| 621 |
+
color: #FFFFFF !important;
|
| 622 |
+
font-family: monospace !important;
|
| 623 |
+
font-size: 13px !important;
|
| 624 |
+
line-height: 1.6 !important;
|
| 625 |
+
margin: 4px 0 !important;
|
| 626 |
+
}
|
| 627 |
+
|
| 628 |
+
.regressions-panel strong {
|
| 629 |
+
color: #FF6B6B !important;
|
| 630 |
+
font-weight: 600 !important;
|
| 631 |
+
}
|
| 632 |
+
|
| 633 |
+
/* Pulse animation for new regressions */
|
| 634 |
+
@keyframes pulse-border {
|
| 635 |
+
0%, 100% {
|
| 636 |
+
border-color: #8B4513;
|
| 637 |
+
box-shadow: 0 4px 12px rgba(255, 107, 107, 0.2);
|
| 638 |
+
}
|
| 639 |
+
50% {
|
| 640 |
+
border-color: #B8621B;
|
| 641 |
+
box-shadow: 0 4px 16px rgba(255, 107, 107, 0.4);
|
| 642 |
+
}
|
| 643 |
+
}
|
| 644 |
+
|
| 645 |
/* Custom scrollbar for main content */
|
| 646 |
.main-content {
|
| 647 |
scrollbar-width: thin !important;
|