Mandark-droid committed on
Commit
659d404
·
1 Parent(s): 4a44e51

Fix screen navigation: DrillDown to Run Detail switching now works


Major fixes to enable proper screen visibility switching:
- Moved the event handlers (on_drilldown_select, go_back_to_leaderboard) to module level, before gr.Blocks(); handlers defined inside the Blocks context had scoping issues
- Wrapped the Sidebar and all screens in a main_app_container Column (matching the MockTraceMind structure)
- Downgraded from Gradio 6.0.0.dev4 to stable Gradio 5.49.1; the dev version had visibility-update bugs that made handlers fire multiple times and prevented updates from applying
- Added a theme configuration to gr.Blocks() (matching the MockTraceMind pattern)
- Updated requirements.txt to gradio>=5.0.0

Screen navigation now works correctly (a minimal sketch of the pattern follows this list):
- Clicking a DrillDown row hides the leaderboard screen and shows the run detail screen
- A single click is enough (multiple clicks are no longer required)
- Screens toggle properly instead of stacking
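
Below is a minimal, self-contained sketch of that pattern: handlers defined at module level before gr.Blocks(), screens wrapped in a parent Column, and visibility toggled with gr.update(visible=...). Component names are illustrative, and the handler returns positional updates rather than the dict form used in app.py.

```python
# Sketch only (an assumed simplification of the app's structure, not its full layout).
import gradio as gr
import pandas as pd

# Handlers live at module level, BEFORE gr.Blocks(), so they are not defined
# inside the Blocks context. Each returns one gr.update() per output component.
def on_row_select(evt: gr.SelectData, df):
    row = df.iloc[evt.index[0]]  # evt.index is [row, col] for a Dataframe select
    return (
        gr.update(visible=False),                         # hide the list screen
        gr.update(visible=True),                          # show the detail screen
        gr.update(value=f"<h3>Run: {row.iloc[0]}</h3>"),  # fill the detail view
    )

def go_back():
    return gr.update(visible=True), gr.update(visible=False)

with gr.Blocks(title="Nav demo", theme=gr.themes.Base(primary_hue="indigo")) as demo:
    with gr.Column():  # wrapper container, like main_app_container in the fix
        with gr.Column(visible=True) as list_screen:
            table = gr.Dataframe(value=pd.DataFrame({"run_id": ["a1", "b2"]}),
                                 interactive=False)
        with gr.Column(visible=False) as detail_screen:
            back_btn = gr.Button("⬅️ Back")
            detail_html = gr.HTML()

    # Wiring stays inside Blocks; a single click fires exactly one handler call.
    table.select(on_row_select, inputs=[table],
                 outputs=[list_screen, detail_screen, detail_html])
    back_btn.click(go_back, outputs=[list_screen, detail_screen])

if __name__ == "__main__":
    demo.launch()
```

Keeping the handlers above the Blocks context mirrors the MockTraceMind structure the commit references and avoids the scoping issues described above.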

Files changed (3)
  1. README.md +4 -0
  2. app.py +590 -144
  3. requirements.txt +4 -2
README.md CHANGED
@@ -5,12 +5,16 @@ colorFrom: indigo
 colorTo: purple
 sdk: gradio
 sdk_version: 5.49.1
+
 app_file: app.py
+short_description: Enterprise-grade AI agent evaluation platform with MCP-powered intelligence and real-time leaderboards
 pinned: false
 tags:
 - mcp-in-action-track-enterprise
 - agent-evaluation
 - mcp-client
+- leaderboard
+- gradio
 ---
 
 # 🔍 TraceMind-AI
app.py CHANGED
@@ -21,20 +21,32 @@ from components.analytics_charts import (
     create_cost_efficiency_scatter
 )
 from components.report_cards import generate_leaderboard_summary_card
 
 # Initialize data loader
 data_loader = create_data_loader_from_env()
 
-# Global state
-leaderboard_df_cache = None
 
 
 def load_leaderboard():
-    """Load initial leaderboard data"""
     global leaderboard_df_cache
 
-    df = data_loader.load_leaderboard()
-    leaderboard_df_cache = df.copy()
 
     html = generate_leaderboard_html(df)
 
@@ -44,6 +56,21 @@ def load_leaderboard():
     return html, gr.update(choices=models), gr.update(choices=models)
 
 
 def apply_filters(model, provider, sort_by_col):
     """Apply filters and sorting to leaderboard"""
     global leaderboard_df_cache
@@ -65,10 +92,13 @@ def apply_filters(model, provider, sort_by_col):
 
 def load_drilldown(agent_type, provider):
     """Load drilldown data with filters"""
     try:
         df = data_loader.load_leaderboard()
 
         if df.empty:
             return pd.DataFrame()
 
         if agent_type != "All" and 'agent_type' in df.columns:
@@ -76,7 +106,11 @@ def load_drilldown(agent_type, provider):
         if provider != "All" and 'provider' in df.columns:
             df = df[df['provider'] == provider]
 
-        # Select only columns that exist
         desired_columns = [
             'run_id', 'model', 'agent_type', 'provider',
             'success_rate', 'total_tests', 'avg_duration_ms', 'total_cost_usd'
@@ -91,6 +125,7 @@ def load_drilldown(agent_type, provider):
 
         display_df = df[available_columns].copy()
 
         return display_df
     except Exception as e:
         print(f"[ERROR] load_drilldown: {e}")
@@ -165,8 +200,365 @@ def generate_insights():
         return f"## 📊 Leaderboard Summary\n\nError generating insights: {str(e)}"
 
 
 # Build Gradio app
-with gr.Blocks(title="TraceMind-AI") as app:
 
     # Top Banner
     gr.HTML("""
@@ -188,190 +580,244 @@ with gr.Blocks(title="TraceMind-AI") as app:
     </div>
     """)
 
-    # Sidebar Navigation
-    with gr.Sidebar():
-        gr.Markdown("## 🧠 TraceMind")
-        gr.Markdown("*Navigation & Controls*")
-
-        gr.Markdown("---")
-
-        # Navigation section
-        gr.Markdown("### 🧭 Navigation")
-
-        # Navigation buttons
-        leaderboard_nav_btn = gr.Button("🏆 Leaderboard", variant="primary", size="lg")
-        compare_nav_btn = gr.Button("⚖️ Compare", variant="secondary", size="lg")
-        docs_nav_btn = gr.Button("📚 Documentation", variant="secondary", size="lg")
-
-        gr.Markdown("---")
-
-        # Filters section
-        gr.Markdown("### 🔍 Global Filters")
-
-        sidebar_model_filter = gr.Dropdown(
-            choices=["All Models"],
-            value="All Models",
-            label="Model",
-            info="Filter evaluations by AI model"
-        )
-
-        sidebar_agent_type_filter = gr.Radio(
-            choices=["All", "tool", "code", "both"],
-            value="All",
-            label="Agent Type",
-            info="Tool: Function calling | Code: Code execution | Both: Hybrid"
-        )
-
-    # Main content area
-    # Screen 1: Main Leaderboard
-    with gr.Column(visible=True) as leaderboard_screen:
-        gr.Markdown("## 🏆 Agent Evaluation Leaderboard")
-        with gr.Tabs():
-            with gr.TabItem("🏆 Leaderboard"):
-                # Filters
-                with gr.Row():
-                    model_filter = gr.Dropdown(
-                        choices=["All Models"],
-                        value="All Models",
-                        label="Filter by Model"
-                    )
-                    provider_filter = gr.Dropdown(
-                        choices=["All", "litellm", "transformers"],
-                        value="All",
-                        label="Provider"
-                    )
-                    sort_by = gr.Dropdown(
-                        choices=["success_rate", "total_cost_usd", "avg_duration_ms"],
-                        value="success_rate",
-                        label="Sort By"
-                    )
-
-                apply_filters_btn = gr.Button("🔍 Apply Filters")
-
-                # HTML table
-                leaderboard_by_model = gr.HTML()
-
-            with gr.TabItem("📋 DrillDown"):
-                with gr.Row():
-                    drilldown_agent_type = gr.Radio(
-                        choices=["All", "tool", "code", "both"],
-                        value="All",
-                        label="Agent Type"
                     )
-                    drilldown_provider = gr.Dropdown(
-                        choices=["All", "litellm", "transformers"],
-                        value="All",
-                        label="Provider"
                     )
-
-                apply_drilldown_btn = gr.Button("🔍 Apply")
-
-                leaderboard_table = gr.Dataframe(
-                    headers=["Run ID", "Model", "Agent Type", "Provider", "Success Rate", "Tests", "Duration", "Cost"],
-                    interactive=False
-                )
-
-            with gr.TabItem("📈 Trends"):
-                trends_plot = gr.Plot()
-
-            with gr.TabItem("📊 Analytics"):
-                viz_type = gr.Radio(
-                    choices=["🔥 Performance Heatmap", "⚡ Speed vs Accuracy", "💰 Cost Efficiency"],
-                    value="🔥 Performance Heatmap",
-                    label="Select Visualization"
-                )
-                analytics_chart = gr.Plot()
-
-            with gr.TabItem("📥 Summary Card"):
-                top_n_slider = gr.Slider(1, 5, 3, step=1, label="Top N Models")
-                generate_card_btn = gr.Button("🎨 Generate Card")
-                card_preview = gr.HTML()
-
-            with gr.TabItem("🤖 AI Insights"):
-                regenerate_btn = gr.Button("🔄 Regenerate")
-                mcp_insights = gr.Markdown("*Loading insights...*")
-
-    # Hidden textbox for row selection (JavaScript bridge)
-    selected_row_index = gr.Textbox(visible=False, elem_id="selected_row_index")
296
-
-    # Event handlers
-    app.load(
         fn=load_leaderboard,
         outputs=[leaderboard_by_model, model_filter, sidebar_model_filter]
-    )
 
-    app.load(
         fn=load_trends,
         outputs=[trends_plot]
-    )
 
-    # Load drilldown data on page load
-    app.load(
         fn=load_drilldown,
         inputs=[drilldown_agent_type, drilldown_provider],
         outputs=[leaderboard_table]
-    )
 
-    apply_filters_btn.click(
         fn=apply_filters,
         inputs=[model_filter, provider_filter, sort_by],
         outputs=[leaderboard_by_model]
-    )
 
-    apply_drilldown_btn.click(
         fn=load_drilldown,
         inputs=[drilldown_agent_type, drilldown_provider],
         outputs=[leaderboard_table]
-    )
 
-    # Sidebar filter handlers
-    def apply_sidebar_model_filter(model, sort_by_col):
-        """Apply sidebar model filter to leaderboard"""
-        return apply_filters(model, "All", sort_by_col), gr.update(value=model)
 
-    sidebar_model_filter.change(
         fn=apply_sidebar_model_filter,
         inputs=[sidebar_model_filter, sort_by],
         outputs=[leaderboard_by_model, model_filter]
-    )
 
-    def apply_sidebar_agent_type_filter(agent_type):
-        """Apply sidebar agent type filter to drilldown"""
-        return load_drilldown(agent_type, "All"), gr.update(value=agent_type)
 
-    sidebar_agent_type_filter.change(
         fn=apply_sidebar_agent_type_filter,
         inputs=[sidebar_agent_type_filter],
         outputs=[leaderboard_table, drilldown_agent_type]
-    )
 
-    viz_type.change(
         fn=update_analytics,
         inputs=[viz_type],
         outputs=[analytics_chart]
-    )
 
-    app.load(
         fn=update_analytics,
         inputs=[viz_type],
         outputs=[analytics_chart]
-    )
 
-    generate_card_btn.click(
         fn=generate_card,
         inputs=[top_n_slider],
         outputs=[card_preview]
-    )
 
-    app.load(
         fn=generate_insights,
         outputs=[mcp_insights]
-    )
 
-    regenerate_btn.click(
         fn=generate_insights,
         outputs=[mcp_insights]
-    )
 
 
 if __name__ == "__main__":
 
21
  create_cost_efficiency_scatter
22
  )
23
  from components.report_cards import generate_leaderboard_summary_card
24
+ from utils.navigation import Navigator, Screen
25
 
26
  # Initialize data loader
27
  data_loader = create_data_loader_from_env()
28
+ navigator = Navigator()
29
 
30
+ # Pre-load and cache the leaderboard data before building UI
31
+ print("πŸ“₯ Pre-loading leaderboard data from HuggingFace...")
32
+ leaderboard_df_cache = data_loader.load_leaderboard()
33
+ print(f"βœ… Loaded {len(leaderboard_df_cache)} evaluation runs")
34
+
35
+ # Global state (already populated)
36
+ # leaderboard_df_cache is now set
37
+
38
+ # Additional global state for navigation
39
+ current_selected_run = None
40
+ current_selected_trace = None
41
+ current_drilldown_df = None # Store currently displayed drilldown data
42
 
43
 
44
  def load_leaderboard():
45
+ """Load initial leaderboard data from cache"""
46
  global leaderboard_df_cache
47
 
48
+ # Use pre-cached data (already loaded before UI build)
49
+ df = leaderboard_df_cache.copy()
50
 
51
  html = generate_leaderboard_html(df)
52
 
 
56
  return html, gr.update(choices=models), gr.update(choices=models)
57
 
58
 
59
+ def refresh_leaderboard():
60
+ """Refresh leaderboard data from source (for reload button)"""
61
+ global leaderboard_df_cache
62
+
63
+ print("πŸ”„ Refreshing leaderboard data...")
64
+ df = data_loader.refresh_leaderboard() # Clears cache and reloads
65
+ leaderboard_df_cache = df.copy()
66
+ print(f"βœ… Refreshed {len(df)} evaluation runs")
67
+
68
+ html = generate_leaderboard_html(df)
69
+ models = ["All Models"] + sorted(df['model'].unique().tolist())
70
+
71
+ return html, gr.update(choices=models), gr.update(choices=models)
72
+
73
+
74
  def apply_filters(model, provider, sort_by_col):
75
  """Apply filters and sorting to leaderboard"""
76
  global leaderboard_df_cache
 
92
 
93
  def load_drilldown(agent_type, provider):
94
  """Load drilldown data with filters"""
95
+ global current_drilldown_df
96
+
97
  try:
98
  df = data_loader.load_leaderboard()
99
 
100
  if df.empty:
101
+ current_drilldown_df = pd.DataFrame()
102
  return pd.DataFrame()
103
 
104
  if agent_type != "All" and 'agent_type' in df.columns:
 
106
  if provider != "All" and 'provider' in df.columns:
107
  df = df[df['provider'] == provider]
108
 
109
+ # IMPORTANT: Store the FULL dataframe in global state (with ALL columns)
110
+ # This ensures the event handler has access to results_dataset, traces_dataset, etc.
111
+ current_drilldown_df = df.copy()
112
+
113
+ # Select only columns for DISPLAY
114
  desired_columns = [
115
  'run_id', 'model', 'agent_type', 'provider',
116
  'success_rate', 'total_tests', 'avg_duration_ms', 'total_cost_usd'
 
125
 
126
  display_df = df[available_columns].copy()
127
 
128
+ # Return ONLY display columns for the UI table
129
  return display_df
130
  except Exception as e:
131
  print(f"[ERROR] load_drilldown: {e}")
 
200
  return f"## πŸ“Š Leaderboard Summary\n\nError generating insights: {str(e)}"
201
 
202
 
203
+ def on_html_table_row_click(row_index_str):
204
+ """Handle row click from HTML table via JavaScript (hidden textbox bridge)"""
205
+ global current_selected_run, leaderboard_df_cache
206
+
207
+ print(f"[DEBUG] on_html_table_row_click called with: '{row_index_str}'")
208
+
209
+ try:
210
+ # Parse row index from string
211
+ if not row_index_str or row_index_str == "" or row_index_str.strip() == "":
212
+ print("[DEBUG] Empty row index, ignoring")
213
+ return {
214
+ leaderboard_screen: gr.update(),
215
+ run_detail_screen: gr.update(),
216
+ run_metadata_html: gr.update(),
217
+ test_cases_table: gr.update(),
218
+ selected_row_index: gr.update(value="") # Clear textbox
219
+ }
220
+
221
+ selected_idx = int(row_index_str)
222
+ print(f"[DEBUG] Parsed row index: {selected_idx}")
223
+
224
+ # Get the full run data from cache
225
+ if leaderboard_df_cache is None or leaderboard_df_cache.empty:
226
+ print("[ERROR] Leaderboard cache is empty")
227
+ gr.Warning("Leaderboard data not loaded")
228
+ return {
229
+ leaderboard_screen: gr.update(),
230
+ run_detail_screen: gr.update(),
231
+ run_metadata_html: gr.update(),
232
+ test_cases_table: gr.update(),
233
+ selected_row_index: gr.update(value="") # Clear textbox
234
+ }
235
+
236
+ if selected_idx < 0 or selected_idx >= len(leaderboard_df_cache):
237
+ print(f"[ERROR] Invalid row index: {selected_idx}, cache size: {len(leaderboard_df_cache)}")
238
+ gr.Warning(f"Invalid row index: {selected_idx}")
239
+ return {
240
+ leaderboard_screen: gr.update(),
241
+ run_detail_screen: gr.update(),
242
+ run_metadata_html: gr.update(),
243
+ test_cases_table: gr.update(),
244
+ selected_row_index: gr.update(value="") # Clear textbox
245
+ }
246
+
247
+ run_data = leaderboard_df_cache.iloc[selected_idx].to_dict()
248
+
249
+ # Set global
250
+ current_selected_run = run_data
251
+
252
+ print(f"[DEBUG] Selected run from HTML table: {run_data.get('model', 'Unknown')} (row {selected_idx})")
253
+
254
+ # Load results for this run
255
+ results_dataset = run_data.get('results_dataset')
256
+ if not results_dataset:
257
+ gr.Warning("No results dataset found for this run")
258
+ return {
259
+ leaderboard_screen: gr.update(visible=True),
260
+ run_detail_screen: gr.update(visible=False),
261
+ run_metadata_html: gr.update(value="<h3>No results dataset found</h3>"),
262
+ test_cases_table: gr.update(value=pd.DataFrame()),
263
+ selected_row_index: gr.update(value="")
264
+ }
265
+
266
+ results_df = data_loader.load_results(results_dataset)
267
+
268
+ # Create metadata HTML
269
+ metadata_html = f"""
270
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
271
+ padding: 20px; border-radius: 10px; color: white; margin-bottom: 20px;">
272
+ <h2 style="margin: 0 0 10px 0;">πŸ“Š Run Detail: {run_data.get('model', 'Unknown')}</h2>
273
+ <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 20px; margin-top: 15px;">
274
+ <div>
275
+ <strong>Agent Type:</strong> {run_data.get('agent_type', 'N/A')}<br>
276
+ <strong>Provider:</strong> {run_data.get('provider', 'N/A')}<br>
277
+ <strong>Success Rate:</strong> {run_data.get('success_rate', 0):.1f}%
278
+ </div>
279
+ <div>
280
+ <strong>Total Tests:</strong> {run_data.get('total_tests', 0)}<br>
281
+ <strong>Successful:</strong> {run_data.get('successful_tests', 0)}<br>
282
+ <strong>Failed:</strong> {run_data.get('failed_tests', 0)}
283
+ </div>
284
+ <div>
285
+ <strong>Total Cost:</strong> ${run_data.get('total_cost_usd', 0):.4f}<br>
286
+ <strong>Avg Duration:</strong> {run_data.get('avg_duration_ms', 0):.0f}ms<br>
287
+ <strong>Submitted By:</strong> {run_data.get('submitted_by', 'Unknown')}
288
+ </div>
289
+ </div>
290
+ </div>
291
+ """
292
+
293
+ # Format results for display
294
+ display_df = results_df.copy()
295
+
296
+ # Select and format columns if they exist
297
+ display_columns = []
298
+ if 'task_id' in display_df.columns:
299
+ display_columns.append('task_id')
300
+ if 'success' in display_df.columns:
301
+ display_df['success'] = display_df['success'].apply(lambda x: "βœ…" if x else "❌")
302
+ display_columns.append('success')
303
+ if 'tool_called' in display_df.columns:
304
+ display_columns.append('tool_called')
305
+ if 'execution_time_ms' in display_df.columns:
306
+ display_df['execution_time_ms'] = display_df['execution_time_ms'].apply(lambda x: f"{x:.0f}ms")
307
+ display_columns.append('execution_time_ms')
308
+ if 'total_tokens' in display_df.columns:
309
+ display_columns.append('total_tokens')
310
+ if 'cost_usd' in display_df.columns:
311
+ display_df['cost_usd'] = display_df['cost_usd'].apply(lambda x: f"${x:.4f}")
312
+ display_columns.append('cost_usd')
313
+ if 'trace_id' in display_df.columns:
314
+ display_columns.append('trace_id')
315
+
316
+ if display_columns:
317
+ display_df = display_df[display_columns]
318
+
319
+ print(f"[DEBUG] Successfully loaded run detail for: {run_data.get('model', 'Unknown')}")
320
+
321
+ return {
322
+ # Hide leaderboard, show run detail
323
+ leaderboard_screen: gr.update(visible=False),
324
+ run_detail_screen: gr.update(visible=True),
325
+ run_metadata_html: gr.update(value=metadata_html),
326
+ test_cases_table: gr.update(value=display_df),
327
+ selected_row_index: gr.update(value="") # Clear textbox
328
+ }
329
+
330
+ except Exception as e:
331
+ print(f"[ERROR] Handling HTML table row click: {e}")
332
+ import traceback
333
+ traceback.print_exc()
334
+ gr.Warning(f"Error loading run details: {str(e)}")
335
+ return {
336
+ leaderboard_screen: gr.update(visible=True), # Stay on leaderboard
337
+ run_detail_screen: gr.update(visible=False),
338
+ run_metadata_html: gr.update(),
339
+ test_cases_table: gr.update(),
340
+ selected_row_index: gr.update(value="") # Clear textbox
341
+ }
342
+
343
+
344
+ def load_run_detail(run_id):
345
+ """Load run detail data including results dataset"""
346
+ global current_selected_run, leaderboard_df_cache
347
+
348
+ try:
349
+ # Find run in cache
350
+ df = leaderboard_df_cache
351
+ run_data = df[df['run_id'] == run_id].iloc[0].to_dict()
352
+ current_selected_run = run_data
353
+
354
+ # Load results dataset
355
+ results_dataset = run_data.get('results_dataset')
356
+ if not results_dataset:
357
+ return pd.DataFrame(), f"# Error\n\nNo results dataset found for this run", ""
358
+
359
+ results_df = data_loader.load_results(results_dataset)
360
+
361
+ # Create metadata HTML
362
+ metadata_html = f"""
363
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
364
+ padding: 20px; border-radius: 10px; color: white; margin-bottom: 20px;">
365
+ <h2 style="margin: 0 0 10px 0;">πŸ“Š Run Detail: {run_data.get('model', 'Unknown')}</h2>
366
+ <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 20px; margin-top: 15px;">
367
+ <div>
368
+ <strong>Agent Type:</strong> {run_data.get('agent_type', 'N/A')}<br>
369
+ <strong>Provider:</strong> {run_data.get('provider', 'N/A')}<br>
370
+ <strong>Success Rate:</strong> {run_data.get('success_rate', 0):.1f}%
371
+ </div>
372
+ <div>
373
+ <strong>Total Tests:</strong> {run_data.get('total_tests', 0)}<br>
374
+ <strong>Successful:</strong> {run_data.get('successful_tests', 0)}<br>
375
+ <strong>Failed:</strong> {run_data.get('failed_tests', 0)}
376
+ </div>
377
+ <div>
378
+ <strong>Total Cost:</strong> ${run_data.get('total_cost_usd', 0):.4f}<br>
379
+ <strong>Avg Duration:</strong> {run_data.get('avg_duration_ms', 0):.0f}ms<br>
380
+ <strong>Submitted By:</strong> {run_data.get('submitted_by', 'Unknown')}
381
+ </div>
382
+ </div>
383
+ </div>
384
+ """
385
+
386
+ # Format results for display
387
+ display_df = results_df.copy()
388
+
389
+ # Select and format columns if they exist
390
+ display_columns = []
391
+ if 'task_id' in display_df.columns:
392
+ display_columns.append('task_id')
393
+ if 'success' in display_df.columns:
394
+ display_df['success'] = display_df['success'].apply(lambda x: "βœ…" if x else "❌")
395
+ display_columns.append('success')
396
+ if 'tool_called' in display_df.columns:
397
+ display_columns.append('tool_called')
398
+ if 'execution_time_ms' in display_df.columns:
399
+ display_df['execution_time_ms'] = display_df['execution_time_ms'].apply(lambda x: f"{x:.0f}ms")
400
+ display_columns.append('execution_time_ms')
401
+ if 'total_tokens' in display_df.columns:
402
+ display_columns.append('total_tokens')
403
+ if 'cost_usd' in display_df.columns:
404
+ display_df['cost_usd'] = display_df['cost_usd'].apply(lambda x: f"${x:.4f}")
405
+ display_columns.append('cost_usd')
406
+ if 'trace_id' in display_df.columns:
407
+ display_columns.append('trace_id')
408
+
409
+ if display_columns:
410
+ display_df = display_df[display_columns]
411
+
412
+ return display_df, metadata_html, run_data.get('run_id', '')
413
+
414
+ except Exception as e:
415
+ print(f"[ERROR] load_run_detail: {e}")
416
+ import traceback
417
+ traceback.print_exc()
418
+ return pd.DataFrame(), f"# Error\n\nError loading run detail: {str(e)}", ""
419
+
420
+
421
+
422
+ # Screen 3 (Run Detail) event handlers
423
+ def on_drilldown_select(evt: gr.SelectData, df):
424
+ """Handle row selection from DrillDown table - EXACT COPY from MockTraceMind"""
425
+ global current_selected_run, current_drilldown_df
426
+
427
+ try:
428
+ # Get selected run - use currently displayed dataframe (filtered/sorted)
429
+ selected_idx = evt.index[0]
430
+
431
+ # Get the full run data from the displayed dataframe
432
+ # This ensures we get the correct row even after filtering/sorting
433
+ if current_drilldown_df is not None and not current_drilldown_df.empty:
434
+ if selected_idx < len(current_drilldown_df):
435
+ run_data = current_drilldown_df.iloc[selected_idx].to_dict()
436
+ else:
437
+ gr.Warning(f"Invalid row selection: index {selected_idx} out of bounds")
438
+ return {}
439
+ else:
440
+ gr.Warning("Leaderboard data not available")
441
+ return {}
442
+
443
+ # IMPORTANT: Set global FIRST before any operations that might fail
444
+ current_selected_run = run_data
445
+
446
+ print(f"[DEBUG] Selected run: {run_data.get('model', 'Unknown')} (run_id: {run_data.get('run_id', 'N/A')[:8]}...)")
447
+
448
+ # Load results for this run
449
+ results_dataset = run_data.get('results_dataset')
450
+ if not results_dataset:
451
+ gr.Warning("No results dataset found for this run")
452
+ return {
453
+ leaderboard_screen: gr.update(visible=True),
454
+ run_detail_screen: gr.update(visible=False),
455
+ run_metadata_html: gr.update(value="<h3>No results dataset found</h3>"),
456
+ test_cases_table: gr.update(value=pd.DataFrame())
457
+ }
458
+
459
+ results_df = data_loader.load_results(results_dataset)
460
+
461
+ # Create metadata HTML
462
+ metadata_html = f"""
463
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
464
+ padding: 20px; border-radius: 10px; color: white; margin-bottom: 20px;">
465
+ <h2 style="margin: 0 0 10px 0;">πŸ“Š Run Detail: {run_data.get('model', 'Unknown')}</h2>
466
+ <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 20px; margin-top: 15px;">
467
+ <div>
468
+ <strong>Agent Type:</strong> {run_data.get('agent_type', 'N/A')}<br>
469
+ <strong>Provider:</strong> {run_data.get('provider', 'N/A')}<br>
470
+ <strong>Success Rate:</strong> {run_data.get('success_rate', 0):.1f}%
471
+ </div>
472
+ <div>
473
+ <strong>Total Tests:</strong> {run_data.get('total_tests', 0)}<br>
474
+ <strong>Successful:</strong> {run_data.get('successful_tests', 0)}<br>
475
+ <strong>Failed:</strong> {run_data.get('failed_tests', 0)}
476
+ </div>
477
+ <div>
478
+ <strong>Total Cost:</strong> ${run_data.get('total_cost_usd', 0):.4f}<br>
479
+ <strong>Avg Duration:</strong> {run_data.get('avg_duration_ms', 0):.0f}ms<br>
480
+ <strong>Submitted By:</strong> {run_data.get('submitted_by', 'Unknown')}
481
+ </div>
482
+ </div>
483
+ </div>
484
+ """
485
+
486
+ # Format results for display
487
+ display_df = results_df.copy()
488
+
489
+ # Select and format columns if they exist
490
+ display_columns = []
491
+ if 'task_id' in display_df.columns:
492
+ display_columns.append('task_id')
493
+ if 'success' in display_df.columns:
494
+ display_df['success'] = display_df['success'].apply(lambda x: "βœ…" if x else "❌")
495
+ display_columns.append('success')
496
+ if 'tool_called' in display_df.columns:
497
+ display_columns.append('tool_called')
498
+ if 'execution_time_ms' in display_df.columns:
499
+ display_df['execution_time_ms'] = display_df['execution_time_ms'].apply(lambda x: f"{x:.0f}ms")
500
+ display_columns.append('execution_time_ms')
501
+ if 'total_tokens' in display_df.columns:
502
+ display_columns.append('total_tokens')
503
+ if 'cost_usd' in display_df.columns:
504
+ display_df['cost_usd'] = display_df['cost_usd'].apply(lambda x: f"${x:.4f}")
505
+ display_columns.append('cost_usd')
506
+ if 'trace_id' in display_df.columns:
507
+ display_columns.append('trace_id')
508
+
509
+ if display_columns:
510
+ display_df = display_df[display_columns]
511
+
512
+ print(f"[DEBUG] Successfully loaded run detail for: {run_data.get('model', 'Unknown')}")
513
+
514
+ return {
515
+ # Hide leaderboard, show run detail
516
+ leaderboard_screen: gr.update(visible=False),
517
+ run_detail_screen: gr.update(visible=True),
518
+ run_metadata_html: gr.update(value=metadata_html),
519
+ test_cases_table: gr.update(value=display_df)
520
+ }
521
+
522
+ except Exception as e:
523
+ print(f"[ERROR] Loading run details: {e}")
524
+ import traceback
525
+ traceback.print_exc()
526
+ gr.Warning(f"Error loading run details: {e}")
527
+
528
+ # Return updates for all output components to avoid Gradio error
529
+ return {
530
+ leaderboard_screen: gr.update(visible=True), # Stay on leaderboard
531
+ run_detail_screen: gr.update(visible=False),
532
+ run_metadata_html: gr.update(value="<h3>Error loading run detail</h3>"),
533
+ test_cases_table: gr.update(value=pd.DataFrame())
534
+ }
535
+
536
+
537
+
538
+ def go_back_to_leaderboard():
539
+ """Navigate back to leaderboard screen"""
540
+ return {
541
+ leaderboard_screen: gr.update(visible=True),
542
+ run_detail_screen: gr.update(visible=False)
543
+ }
544
+
545
+
546
  # Build Gradio app
547
+ # Theme configuration (like MockTraceMind)
548
+ theme = gr.themes.Base(
549
+ primary_hue="indigo",
550
+ secondary_hue="purple",
551
+ neutral_hue="slate",
552
+ font=gr.themes.GoogleFont("Inter"),
553
+ ).set(
554
+ body_background_fill="*neutral_50",
555
+ body_background_fill_dark="*neutral_900",
556
+ button_primary_background_fill="*primary_500",
557
+ button_primary_background_fill_hover="*primary_600",
558
+ button_primary_text_color="white",
559
+ )
560
+
561
+ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
562
 
563
  # Top Banner
564
  gr.HTML("""
 
580
  </div>
581
  """)
582
 
583
+ # Main app container (wraps Sidebar + all screens like MockTraceMind)
584
+ with gr.Column() as main_app_container:
585
+
586
+
587
+ # Sidebar Navigation
588
+ with gr.Sidebar():
589
+ gr.Markdown("## 🧠 TraceMind")
590
+ gr.Markdown("*Navigation & Controls*")
591
+
592
+ gr.Markdown("---")
593
+
594
+ # Navigation section
595
+ gr.Markdown("### 🧭 Navigation")
596
+
597
+ # Navigation buttons
598
+ leaderboard_nav_btn = gr.Button("πŸ† Leaderboard", variant="primary", size="lg")
599
+ compare_nav_btn = gr.Button("βš–οΈ Compare", variant="secondary", size="lg")
600
+ docs_nav_btn = gr.Button("πŸ“š Documentation", variant="secondary", size="lg")
601
+
602
+ gr.Markdown("---")
603
+
604
+ # Data Controls
605
+ gr.Markdown("### πŸ”„ Data Controls")
606
+ refresh_leaderboard_btn = gr.Button("πŸ”„ Refresh Data", variant="secondary", size="sm")
607
+ gr.Markdown("*Reload leaderboard from HuggingFace*")
608
+
609
+ gr.Markdown("---")
610
+
611
+ # Filters section
612
+ gr.Markdown("### πŸ” Global Filters")
613
+
614
+ sidebar_model_filter = gr.Dropdown(
615
+ choices=["All Models"],
616
+ value="All Models",
617
+ label="Model",
618
+ info="Filter evaluations by AI model"
619
+ )
620
+
621
+ sidebar_agent_type_filter = gr.Radio(
622
+ choices=["All", "tool", "code", "both"],
623
+ value="All",
624
+ label="Agent Type",
625
+ info="Tool: Function calling | Code: Code execution | Both: Hybrid"
626
+ )
627
+
628
+ # Main content area
629
+ # Screen 1: Main Leaderboard
630
+ with gr.Column(visible=True) as leaderboard_screen:
631
+ gr.Markdown("## πŸ† Agent Evaluation Leaderboard")
632
+ with gr.Tabs():
633
+ with gr.TabItem("πŸ† Leaderboard"):
634
+ # Filters
635
+ with gr.Row():
636
+ model_filter = gr.Dropdown(
637
+ choices=["All Models"],
638
+ value="All Models",
639
+ label="Filter by Model"
640
+ )
641
+ provider_filter = gr.Dropdown(
642
+ choices=["All", "litellm", "transformers"],
643
+ value="All",
644
+ label="Provider"
645
+ )
646
+ sort_by = gr.Dropdown(
647
+ choices=["success_rate", "total_cost_usd", "avg_duration_ms"],
648
+ value="success_rate",
649
+ label="Sort By"
650
+ )
651
+
652
+ apply_filters_btn = gr.Button("πŸ” Apply Filters")
653
+
654
+ # HTML table
655
+ leaderboard_by_model = gr.HTML()
656
+
657
+ with gr.TabItem("πŸ“‹ DrillDown"):
658
+ with gr.Row():
659
+ drilldown_agent_type = gr.Radio(
660
+ choices=["All", "tool", "code", "both"],
661
+ value="All",
662
+ label="Agent Type"
663
+ )
664
+ drilldown_provider = gr.Dropdown(
665
+ choices=["All", "litellm", "transformers"],
666
+ value="All",
667
+ label="Provider"
668
+ )
669
+
670
+ apply_drilldown_btn = gr.Button("πŸ” Apply")
671
+
672
+ leaderboard_table = gr.Dataframe(
673
+ headers=["Run ID", "Model", "Agent Type", "Provider", "Success Rate", "Tests", "Duration", "Cost"],
674
+ interactive=False
675
  )
676
+
677
+ with gr.TabItem("πŸ“ˆ Trends"):
678
+ trends_plot = gr.Plot()
679
+
680
+ with gr.TabItem("πŸ“Š Analytics"):
681
+ viz_type = gr.Radio(
682
+ choices=["πŸ”₯ Performance Heatmap", "⚑ Speed vs Accuracy", "πŸ’° Cost Efficiency"],
683
+ value="πŸ”₯ Performance Heatmap",
684
+ label="Select Visualization"
685
  )
686
+ analytics_chart = gr.Plot()
687
+
688
+ with gr.TabItem("πŸ“₯ Summary Card"):
689
+ top_n_slider = gr.Slider(1, 5, 3, step=1, label="Top N Models")
690
+ generate_card_btn = gr.Button("🎨 Generate Card")
691
+ card_preview = gr.HTML()
692
+
693
+ with gr.TabItem("πŸ€– AI Insights"):
694
+ regenerate_btn = gr.Button("πŸ”„ Regenerate")
695
+ mcp_insights = gr.Markdown("*Loading insights...*")
696
+
697
+ # Hidden textbox for row selection (JavaScript bridge)
698
+ selected_row_index = gr.Textbox(visible=False, elem_id="selected_row_index")
699
+
700
+ # Screen 3: Run Detail
701
+ with gr.Column(visible=False) as run_detail_screen:
702
+ # Navigation
703
+ with gr.Row():
704
+ back_to_leaderboard_btn = gr.Button("⬅️ Back to Leaderboard", variant="secondary", size="sm")
705
+
706
+ # Run metadata display
707
+ run_metadata_html = gr.HTML()
708
+
709
+ # Test cases table
710
+ gr.Markdown("## πŸ“‹ Test Cases")
711
+ test_cases_table = gr.Dataframe(
712
+ headers=["Task ID", "Status", "Tool", "Duration", "Tokens", "Cost", "Trace ID"],
713
+ interactive=False,
714
+ wrap=True
715
+ )
716
+
717
+ # Event handlers
718
+ app.load(
719
  fn=load_leaderboard,
720
  outputs=[leaderboard_by_model, model_filter, sidebar_model_filter]
721
+ )
722
 
723
+ app.load(
724
  fn=load_trends,
725
  outputs=[trends_plot]
726
+ )
727
 
728
+ # Load drilldown data on page load
729
+ app.load(
730
  fn=load_drilldown,
731
  inputs=[drilldown_agent_type, drilldown_provider],
732
  outputs=[leaderboard_table]
733
+ )
734
 
735
+ # Refresh button handler
736
+ refresh_leaderboard_btn.click(
737
+ fn=refresh_leaderboard,
738
+ outputs=[leaderboard_by_model, model_filter, sidebar_model_filter]
739
+ )
740
+
741
+ apply_filters_btn.click(
742
  fn=apply_filters,
743
  inputs=[model_filter, provider_filter, sort_by],
744
  outputs=[leaderboard_by_model]
745
+ )
746
 
747
+ apply_drilldown_btn.click(
748
  fn=load_drilldown,
749
  inputs=[drilldown_agent_type, drilldown_provider],
750
  outputs=[leaderboard_table]
751
+ )
752
 
753
+ # Sidebar filter handlers
754
+ def apply_sidebar_model_filter(model, sort_by_col):
755
+ """Apply sidebar model filter to leaderboard"""
756
+ return apply_filters(model, "All", sort_by_col), gr.update(value=model)
757
 
758
+ sidebar_model_filter.change(
759
  fn=apply_sidebar_model_filter,
760
  inputs=[sidebar_model_filter, sort_by],
761
  outputs=[leaderboard_by_model, model_filter]
762
+ )
763
 
764
+ def apply_sidebar_agent_type_filter(agent_type):
765
+ """Apply sidebar agent type filter to drilldown"""
766
+ return load_drilldown(agent_type, "All"), gr.update(value=agent_type)
767
 
768
+ sidebar_agent_type_filter.change(
769
  fn=apply_sidebar_agent_type_filter,
770
  inputs=[sidebar_agent_type_filter],
771
  outputs=[leaderboard_table, drilldown_agent_type]
772
+ )
773
 
774
+ viz_type.change(
775
  fn=update_analytics,
776
  inputs=[viz_type],
777
  outputs=[analytics_chart]
778
+ )
779
 
780
+ app.load(
781
  fn=update_analytics,
782
  inputs=[viz_type],
783
  outputs=[analytics_chart]
784
+ )
785
 
786
+ generate_card_btn.click(
787
  fn=generate_card,
788
  inputs=[top_n_slider],
789
  outputs=[card_preview]
790
+ )
791
 
792
+ app.load(
793
  fn=generate_insights,
794
  outputs=[mcp_insights]
795
+ )
796
 
797
+ regenerate_btn.click(
798
  fn=generate_insights,
799
  outputs=[mcp_insights]
800
+ )
801
+
802
+
803
+ leaderboard_table.select(
804
+ fn=on_drilldown_select,
805
+ inputs=[leaderboard_table], # Pass dataframe to handler (like MockTraceMind)
806
+ outputs=[leaderboard_screen, run_detail_screen, run_metadata_html, test_cases_table]
807
+ )
808
+
809
+ back_to_leaderboard_btn.click(
810
+ fn=go_back_to_leaderboard,
811
+ inputs=[],
812
+ outputs=[leaderboard_screen, run_detail_screen]
813
+ )
814
+
815
+ # HTML table row click handler (JavaScript bridge via hidden textbox)
816
+ selected_row_index.change(
817
+ fn=on_html_table_row_click,
818
+ inputs=[selected_row_index],
819
+ outputs=[leaderboard_screen, run_detail_screen, run_metadata_html, test_cases_table, selected_row_index]
820
+ )
821
 
822
 
823
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,9 +1,9 @@
 # Gradio for UI
-gradio[mcp]==5.49.1
+gradio>=5.0.0
 
 # HuggingFace for dataset loading
 datasets>=2.14.0
-huggingface-hub>=0.20.0
+huggingface-hub>=0.26.0
 
 # Data processing
 pandas>=2.0.0
@@ -16,3 +16,5 @@ requests>=2.31.0
 
 # Optional: For enhanced visualizations
 plotly>=5.18.0
+matplotlib>=3.8.0
+hf_xet