"""
Chat Screen for TraceMind-AI
Agentic chat interface using smolagents with MCP servers as tools
Demonstrates autonomous Agent behavior for Track 2 submission
"""

from __future__ import annotations  # keep type hints lazy so a missing smolagents doesn't break import

import gradio as gr
import os
import re
import yaml

# Smolagents imports
try:
    from smolagents import CodeAgent, InferenceClientModel, LiteLLMModel
    from smolagents.mcp_client import MCPClient
    from smolagents.agent_types import AgentAudio, AgentImage, AgentText
    from smolagents.agents import MultiStepAgent, PlanningStep
    from smolagents.memory import ActionStep, FinalAnswerStep
    from smolagents.models import ChatMessageStreamDelta
    SMOLAGENTS_AVAILABLE = True
except ImportError:
    SMOLAGENTS_AVAILABLE = False
    print("[WARNING] smolagents not installed - Chat screen will use mock agent")

# TraceMind MCP Server endpoint
MCP_SERVER_URL = "https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse"

# Model configuration - can be set via environment variables
MODEL_TYPE = os.getenv("AGENT_MODEL_TYPE", "hfapi")  # Options: "hfapi", "inference_client", "litellm"
HF_TOKEN = os.getenv("HF_TOKEN", "")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
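
# Example configuration (illustrative values, not the app's defaults):
#
#   export AGENT_MODEL_TYPE=litellm            # Gemini via LiteLLM
#   export GEMINI_API_KEY=...                  # required for the litellm branch
#
#   export AGENT_MODEL_TYPE=inference_client   # DeepSeek-V3 via Nebius
#   export HF_TOKEN=...                        # also used by the default "hfapi" branch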

# Global MCP client (shared, stateless connection to MCP server)
# Agent instances are session-specific via gr.State
_global_mcp_client = None


# ============================================================================
# Helper Functions for Agent Step Processing
# ============================================================================

def get_step_footnote_content(step_log: ActionStep | PlanningStep, step_name: str) -> str:
    """Get a footnote string for a step log with duration and token information"""
    step_footnote = f"**{step_name}**"

    # Check if token_usage attribute exists and is not None
    if hasattr(step_log, 'token_usage') and step_log.token_usage is not None:
        step_footnote += f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}"

    # Add duration information if available
    if hasattr(step_log, 'timing') and step_log.timing and step_log.timing.duration:
        step_footnote += f" | Duration: {round(float(step_log.timing.duration), 2)}s"

    step_footnote_content = f"""<span style="color: #bbbbc2; font-size: 12px;">{step_footnote}</span> """
    return step_footnote_content


def _clean_model_output(model_output: str) -> str:
    """Clean up model output by removing trailing tags and extra backticks."""
    if not model_output:
        return ""
    model_output = model_output.strip()
    # Remove any trailing <end_code> and extra backticks, handling multiple possible formats
    model_output = re.sub(r"```\s*<end_code>", "```", model_output)
    model_output = re.sub(r"<end_code>\s*```", "```", model_output)
    model_output = re.sub(r"```\s*\n\s*<end_code>", "```", model_output)
    return model_output.strip()


def _format_code_content(content: str) -> str:
    """Format code content as Python code block if it's not already formatted."""
    content = content.strip()
    # Remove existing code blocks and end_code tags
    content = re.sub(r"```.*?\n", "", content)
    content = re.sub(r"\s*<end_code>\s*", "", content)
    content = content.strip()
    # Add Python code block formatting if not already present
    if not content.startswith("```python"):
        content = f"```python\n{content}\n```"
    return content
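
# Worked example (illustrative, not a test):
#   _format_code_content("x = 1\nprint(x)<end_code>")
#   returns "```python\nx = 1\nprint(x)\n```"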


def _process_action_step(step_log: ActionStep, skip_model_outputs: bool = False):
    """Process an ActionStep and yield appropriate Gradio ChatMessage objects."""

    # Output the step number
    step_number = f"πŸ”§ Step {step_log.step_number}"
    if not skip_model_outputs:
        yield gr.ChatMessage(role="assistant", content=f"**{step_number}**", metadata={"status": "done"})

    # First yield the thought/reasoning from the LLM (collapsed)
    if not skip_model_outputs and getattr(step_log, "model_output", ""):
        model_output = _clean_model_output(step_log.model_output)
        yield gr.ChatMessage(
            role="assistant",
            content=model_output,
            metadata={"title": "πŸ’­ Reasoning", "status": "done"}
        )

    # For tool calls, create a parent message
    if getattr(step_log, "tool_calls", []):
        first_tool_call = step_log.tool_calls[0]
        used_code = first_tool_call.name in ["python_interpreter", "execute_code", "final_answer"]

        # Process arguments based on type
        args = first_tool_call.arguments
        if isinstance(args, dict):
            content = str(args.get("answer", str(args)))
        else:
            content = str(args).strip()

        # Format code content if needed
        if used_code and "```" not in content:
            content = _format_code_content(content)

        # Choose appropriate emoji and title based on tool
        tool_emoji = "πŸ› οΈ"
        tool_title = f"Used tool: {first_tool_call.name}"

        # Specific tool icons for TraceMind MCP tools
        if "leaderboard" in first_tool_call.name.lower():
            tool_emoji = "πŸ“Š"
            tool_title = f"Analyzed Leaderboard using {first_tool_call.name}"
        elif "trace" in first_tool_call.name.lower() or "debug" in first_tool_call.name.lower():
            tool_emoji = "πŸ”"
            tool_title = f"Debugged Trace using {first_tool_call.name}"
        elif "cost" in first_tool_call.name.lower() or "estimate" in first_tool_call.name.lower():
            tool_emoji = "πŸ’°"
            tool_title = f"Estimated Cost using {first_tool_call.name}"
        elif used_code:
            tool_emoji = "πŸ’»"
            tool_title = f"Executed Code using {first_tool_call.name}"

        # Create the tool call message
        parent_message_tool = gr.ChatMessage(
            role="assistant",
            content=content,
            metadata={
                "title": f"{tool_emoji} {tool_title}",
                "status": "done",
            },
        )
        yield parent_message_tool

    # Display execution logs if they exist
    if getattr(step_log, "observations", "") and step_log.observations.strip():
        log_content = step_log.observations.strip()
        log_content = re.sub(r"^Execution logs:\s*", "", log_content)
        yield gr.ChatMessage(
            role="assistant",
            content=f"```bash\n{log_content}\n```",
            metadata={"title": "πŸ“‹ Execution Logs", "status": "done"},
        )

    # Handle errors
    if getattr(step_log, "error", None):
        error_msg = f"⚠️ **Error:** {str(step_log.error)}"
        yield gr.ChatMessage(
            role="assistant", content=error_msg, metadata={"title": "🚫 Error", "status": "done"}
        )

    # Add step footnote and separator
    yield gr.ChatMessage(
        role="assistant", content=get_step_footnote_content(step_log, step_number), metadata={"status": "done"}
    )
    yield gr.ChatMessage(role="assistant", content="---", metadata={"status": "done"})


def _process_planning_step(step_log: PlanningStep, skip_model_outputs: bool = False):
    """Process a PlanningStep and yield appropriate gradio.ChatMessage objects."""
    if not skip_model_outputs:
        # Show planning phase as collapsible section
        yield gr.ChatMessage(
            role="assistant",
            content=step_log.plan,
            metadata={"title": "🧠 Planning Phase", "status": "done"}
        )
    yield gr.ChatMessage(
        role="assistant", content=get_step_footnote_content(step_log, "Planning Phase"), metadata={"status": "done"}
    )
    yield gr.ChatMessage(role="assistant", content="---", metadata={"status": "done"})


def _process_final_answer_step(step_log: FinalAnswerStep):
    """Process a FinalAnswerStep and yield appropriate gradio.ChatMessage objects."""
    # Try different possible attribute names for the final answer
    final_answer = None
    possible_attrs = ['output', 'answer', 'result', 'content', 'final_answer']

    for attr in possible_attrs:
        value = getattr(step_log, attr, None)
        if value is not None:
            final_answer = value
            break

    # If no populated attribute was found, fall back to the step's string form
    if final_answer is None:
        yield gr.ChatMessage(
            role="assistant",
            content=f"**Final answer:** {str(step_log)}",
            metadata={"status": "done"}
        )
        return

    # Process the final answer based on its type (NOT collapsed - visible by default)
    if isinstance(final_answer, AgentText):
        yield gr.ChatMessage(
            role="assistant",
            content=f"πŸ“œ **Final Answer:**\n\n{final_answer.to_string()}",
            metadata={"status": "done"},
        )
    elif isinstance(final_answer, AgentImage):
        # Handle image if needed
        yield gr.ChatMessage(
            role="assistant",
            content=f"🎨 **Image Result:**\n\n![Image]({final_answer.to_string()})",
            metadata={"status": "done"},
        )
    elif isinstance(final_answer, AgentAudio):
        yield gr.ChatMessage(
            role="assistant",
            content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
            metadata={"status": "done"},
        )
    else:
        # Assume markdown content and render as-is
        yield gr.ChatMessage(
            role="assistant",
            content=f"πŸ“œ **Final Answer:**\n\n{str(final_answer)}",
            metadata={"status": "done"},
        )


def pull_messages_from_step(step_log: ActionStep | PlanningStep | FinalAnswerStep, skip_model_outputs: bool = False):
    """Extract Gradio ChatMessage objects from agent steps with proper nesting."""
    if isinstance(step_log, ActionStep):
        yield from _process_action_step(step_log, skip_model_outputs)
    elif isinstance(step_log, PlanningStep):
        yield from _process_planning_step(step_log, skip_model_outputs)
    elif isinstance(step_log, FinalAnswerStep):
        yield from _process_final_answer_step(step_log)
    else:
        raise ValueError(f"Unsupported step type: {type(step_log)}")


def stream_to_gradio(
        agent,
        task: str,
        reset_agent_memory: bool = False,
):
    """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
    intermediate_text = ""

    for event in agent.run(
            task, stream=True, max_steps=20, reset=reset_agent_memory
    ):
        if isinstance(event, ActionStep | PlanningStep | FinalAnswerStep):
            intermediate_text = ""
            for message in pull_messages_from_step(
                    event,
                    skip_model_outputs=getattr(agent, "stream_outputs", False),
            ):
                yield message
        elif isinstance(event, ChatMessageStreamDelta):
            intermediate_text += event.content or ""
            yield intermediate_text


def get_mcp_tools():
    """Get tools from MCP server (shared connection, stateless)"""
    global _global_mcp_client

    # Reuse MCP client connection if already established
    if _global_mcp_client is None:
        try:
            print(f"Connecting to TraceMind MCP Server at {MCP_SERVER_URL}...")
            print(f"Using SSE transport for Gradio MCP server...")

            # For Gradio MCP servers, must specify transport: "sse"
            _global_mcp_client = MCPClient(
                {"url": MCP_SERVER_URL, "transport": "sse"}
            )

            print("Fetching tools from MCP server...")
            tools = _global_mcp_client.get_tools()
            print(f"Received {len(tools)} tools from MCP server")

            # Log available tools
            tool_names = [tool.name for tool in tools]
            print(f"βœ… Connected to TraceMind MCP server")
            print(f"βœ… Received {len(tools)} tools:")
            for tool in tools:
                print(f"   - {tool.name}")

            return tools

        except Exception as e:
            print(f"[ERROR] Connecting to MCP server: {e}")
            import traceback
            traceback.print_exc()
            return []
    else:
        # Return tools from existing connection
        return _global_mcp_client.get_tools()


def create_agent():
    """Create smolagents agent with MCP server tools (per-session instance)"""
    if not SMOLAGENTS_AVAILABLE:
        return None

    try:
        # Get tools from shared MCP connection
        tools = get_mcp_tools()
        if not tools:
            print("[ERROR] No tools available from MCP server")
            return None

        # Create model based on configuration
        if MODEL_TYPE == "inference_client":
            # InferenceClientModel with Nebius provider (DeepSeek-V3)
            model = InferenceClientModel(
                model_id="deepseek-ai/DeepSeek-V3-0324",
                provider="nebius",
                api_key=HF_TOKEN,
            )
            print(f"Using InferenceClientModel: deepseek-ai/DeepSeek-V3-0324 (Nebius)")

        elif MODEL_TYPE == "litellm":
            # LiteLLMModel with Gemini
            model = LiteLLMModel(
                model_id="gemini/gemini-2.5-flash",
                api_key=GEMINI_API_KEY
            )
            print(f"Using LiteLLMModel: gemini/gemini-2.5-flash")

        else:  # Default: hfapi (using InferenceClientModel)
            # InferenceClientModel with Qwen (HF Inference API)
            model = InferenceClientModel(
                model_id='Qwen/Qwen3-Coder-480B-A35B-Instruct',
                token=HF_TOKEN if HF_TOKEN else None,
            )
            print(f"Using InferenceClientModel: Qwen/Qwen3-Coder-480B-A35B-Instruct (HF Inference API)")

        # Load prompt templates from YAML file
        prompt_template_path = os.path.join(os.path.dirname(__file__), "../prompts/code_agent.yaml")
        with open(prompt_template_path, 'r', encoding='utf-8') as stream:
            prompt_templates = yaml.safe_load(stream)

        # Create NEW CodeAgent instance for this session
        agent = CodeAgent(
            tools=[*tools],
            model=model,
            prompt_templates=prompt_templates,
            max_steps=10,
            planning_interval=5,
            additional_authorized_imports=[
                'time', 'math', 'queue', 're', 'stat', 'collections', 'datetime',
                'statistics', 'itertools', 'unicodedata', 'random',
                'pandas', 'numpy', 'json', 'yaml', 'plotly', 'ast'
            ]
        )

        print("βœ… Agent created successfully (session-specific instance)")
        print(f"βœ… Agent has {len(agent.tools)} tools registered:")
        for tool_name in agent.tools.keys():
            print(f"   - {tool_name}")
        return agent

    except Exception as e:
        print(f"[ERROR] Creating agent: {e}")
        import traceback
        traceback.print_exc()
        return None


def cleanup_agent():
    """
    Cleanup MCP client connection (global, shared connection)
    Note: Individual agent instances are garbage collected automatically
    """
    global _global_mcp_client

    if _global_mcp_client is not None:
        try:
            print("Disconnecting MCP client...")
            _global_mcp_client.disconnect()
            print("βœ… MCP client disconnected")
        except Exception as e:
            print(f"[WARNING] Error disconnecting MCP client: {e}")
        finally:
            _global_mcp_client = None
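
# A sketch for releasing the shared MCP connection at interpreter shutdown
# (an assumption: whether atexit fits a given deployment is the host app's call):
#
#   import atexit
#   atexit.register(cleanup_agent)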


def chat_with_agent(message: str, history: list, agent_state):
    """
    Process user message with agent using streaming

    Args:
        message: User's input message
        history: Chat history (list of ChatMessage objects)
        agent_state: Session-specific agent instance (gr.State)

    Yields:
        Tuple of (updated_history, updated_agent_state)
    """

    if not SMOLAGENTS_AVAILABLE:
        # Mock response for when smolagents isn't available
        history.append(gr.ChatMessage(role="user", content=message, metadata={"status": "done"}))
        history.append(gr.ChatMessage(
            role="assistant",
            content="πŸ€– Agent not available (smolagents not installed). Install with: pip install smolagents",
            metadata={"status": "done"}
        ))
        yield history, agent_state
        return

    try:
        # Create agent if not exists in session state
        if agent_state is None:
            agent_state = create_agent()
            if agent_state is None:
                history.append(gr.ChatMessage(role="user", content=message, metadata={"status": "done"}))
                history.append(gr.ChatMessage(
                    role="assistant",
                    content="❌ Failed to initialize agent",
                    metadata={"status": "done"}
                ))
                yield history, agent_state
                return

        # Add user message
        history.append(gr.ChatMessage(role="user", content=message, metadata={"status": "done"}))
        yield history, agent_state

        # Stream agent responses (agent maintains its own memory across messages in this session)
        for msg in stream_to_gradio(agent_state, task=message, reset_agent_memory=False):
            if isinstance(msg, gr.ChatMessage):
                # Mark previous message as done if it was pending
                if history and history[-1].metadata.get("status") == "pending":
                    history[-1].metadata["status"] = "done"
                history.append(msg)
            elif isinstance(msg, str):  # Streaming text delta
                msg = msg.replace("<", r"\<").replace(">", r"\>")  # HTML tags seem to break Gradio Chatbot
                if history and history[-1].metadata.get("status") == "pending":
                    history[-1].content = msg
                else:
                    history.append(gr.ChatMessage(role="assistant", content=msg, metadata={"status": "pending"}))
            yield history, agent_state

        # Mark final message as done
        if history and history[-1].metadata.get("status") == "pending":
            history[-1].metadata["status"] = "done"
        yield history, agent_state

    except Exception as e:
        import traceback
        error_msg = f"❌ Error: {str(e)}\n\n```\n{traceback.format_exc()}\n```"
        history.append(gr.ChatMessage(
            role="assistant",
            content=error_msg,
            metadata={"title": "🚫 Error", "status": "done"}
        ))
        yield history, agent_state


def create_chat_ui():
    """
    Create the chat screen UI

    Returns:
        Tuple of (screen_column, component_dict)
    """
    components = {}

    # Session-specific agent state (each browser tab gets its own agent instance)
    components['agent_state'] = gr.State(value=None)

    with gr.Column(visible=False) as chat_screen:
        gr.Markdown("# πŸ€– Agent Chat")
        gr.Markdown("*Autonomous AI agent powered by smolagents with MCP tools*")

        # Info banner
        with gr.Accordion("πŸ’‘ About This Agent", open=False):
            gr.Markdown("""
            ### 🎯 What is this?
            This is an **autonomous AI agent** that can:
            - πŸ” **Analyze** evaluation results across the leaderboard
            - πŸ› **Debug** specific traces and identify issues
            - πŸ’° **Estimate** costs for running evaluations
            - 🧠 **Reason** through complex multi-step tasks
            - πŸ› οΈ **Use MCP servers** as tools for data access

            ### πŸš€ Key Features (Track 2 Requirements)
            - βœ… **Autonomous Planning**: Agent decides which tools to use
            - βœ… **Multi-Step Reasoning**: Breaks down complex queries
            - βœ… **MCP Integration**: Uses MCP servers (leaderboard analyzer, trace debugger, cost estimator)
            - βœ… **Tool Execution**: Calls tools based on user intent
            - βœ… **Context Engineering**: Maintains conversation context

            ### πŸ’¬ Example Questions
            - "What are the top 3 performing models and how much do they cost?"
            - "Which model should I use for a cost-sensitive project?"
            - "Estimate the cost of evaluating GPT-4 on 200 tests"
            - "Compare Llama 3.1 vs GPT-4 in terms of speed and cost"
            - "Why would I choose H200 over A10 GPU?"

            ### 🧰 Available Tools (MCP Servers)
            1. **analyze_leaderboard**: Get insights from evaluation data
            2. **debug_trace**: Analyze specific trace executions
            3. **estimate_cost**: Calculate evaluation costs and duration
            """)

        with gr.Row():
            with gr.Column(scale=2):
                # Chat interface (using type="messages" for rich ChatMessage display)
                components['chatbot'] = gr.Chatbot(
                    label="Agent Conversation",
                    type="messages",
                    height=500,
                    show_label=True,
                    show_copy_button=True,
                    avatar_images=(
                        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
                        "https://raw.githubusercontent.com/Mandark-droid/TraceMind-AI/assets/Logo.png"
                    )
                )

                with gr.Row():
                    components['message'] = gr.Textbox(
                        placeholder="Ask me anything about agent evaluations...",
                        label="Your Message",
                        lines=2,
                        scale=4,
                        info="The agent will analyze your question and use appropriate tools"
                    )
                    components['send_btn'] = gr.Button("Send", variant="primary", scale=1)

                with gr.Row():
                    components['clear_btn'] = gr.Button("πŸ—‘οΈ Clear Chat")

            with gr.Column(scale=1):
                # Info panel
                gr.Markdown("### ℹ️ Agent Status")
                gr.Markdown("""
                The agent's reasoning, tool calls, and execution logs are displayed inline in the chat.

                **Look for:**
                - πŸ’­ **Reasoning** - Agent's thought process
                - πŸ› οΈ **Tool Calls** - MCP server invocations
                - πŸ“‹ **Execution Logs** - Tool outputs
                - πŸ“œ **Final Answer** - Agent's response
                """)

                # Quick actions
                gr.Markdown("### ⚑ Quick Actions")
                gr.Markdown("**Basic:**")
                components['quick_analyze'] = gr.Button("πŸ” Analyze Leaderboard", size="sm")
                components['quick_costs'] = gr.Button("πŸ’° Compare Costs", size="sm")
                components['quick_recommend'] = gr.Button("🎯 Get Recommendations", size="sm")

                gr.Markdown("**Advanced:**")
                components['quick_multi_tool'] = gr.Button("πŸ”— Multi-Tool Analysis", size="sm")
                components['quick_synthetic'] = gr.Button("πŸ§ͺ Generate Synthetic Data", size="sm")

    return chat_screen, components


def on_send_message(message, history, agent_state):
    """Handle send button click - now uses streaming with per-session agent"""
    if not message.strip():
        yield history, "", agent_state
        return

    # Stream agent responses with session-specific agent
    for updated_history, updated_agent in chat_with_agent(message, history, agent_state):
        yield updated_history, "", updated_agent


def on_clear_chat(agent_state):
    """
    Handle clear button click
    Note: Does NOT cleanup global MCP connection (shared across sessions)
    Only resets this session's agent instance
    """
    # Return empty history and None agent (will create new agent on next message)
    return [], None


def on_quick_action(action_type):
    """Handle quick action buttons"""
    prompts = {
        "analyze": "Analyze the current leaderboard and show me the top performing models with their costs",
        "costs": "Compare the costs of the top 3 models - which one offers the best value?",
        "recommend": "Based on the leaderboard data, which model would you recommend for a production system that needs both good accuracy and reasonable cost?",
        "multi_tool": "Analyze the leaderboard with focus on cost and accuracy, identify the top 2 models, compare them, and estimate the cost of running 500 evaluations on the cheaper one",
        "synthetic": "Generate a synthetic test dataset with 100 tasks for the food-delivery domain using these tools: search_restaurants, view_menu, place_order, track_delivery, apply_promo, rate_restaurant, contact_driver with difficulty_distribution='balanced' and agent_type='both'. Then create a prompt template for the same domain and tools using agent_type='tool', and push the dataset to MCP-1st-Birthday/smoltrace-food-delivery-tasks-v2"
    }
    return prompts.get(action_type, "")
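

# ----------------------------------------------------------------------------
# Standalone wiring sketch. This is a minimal example, not the app's actual
# entry point: TraceMind-AI's host app is assumed to perform this wiring when
# it assembles its screens. The block runs only when the module is executed
# directly.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    with gr.Blocks() as demo:
        chat_screen, c = create_chat_ui()

        # The screen is created hidden for multi-screen navigation;
        # standalone, reveal it on page load
        demo.load(lambda: gr.Column(visible=True), outputs=[chat_screen])

        # Streamed send: button click and Enter both route through
        # on_send_message, which updates the history and clears the textbox
        send_io = dict(
            inputs=[c['message'], c['chatbot'], c['agent_state']],
            outputs=[c['chatbot'], c['message'], c['agent_state']],
        )
        c['send_btn'].click(on_send_message, **send_io)
        c['message'].submit(on_send_message, **send_io)

        # Reset this session's history and agent (the shared MCP client stays up)
        c['clear_btn'].click(
            on_clear_chat,
            inputs=[c['agent_state']],
            outputs=[c['chatbot'], c['agent_state']],
        )

        # Quick actions pre-fill the message box; the advanced buttons follow
        # the same pattern with the "multi_tool" and "synthetic" keys
        c['quick_analyze'].click(lambda: on_quick_action("analyze"), outputs=[c['message']])
        c['quick_costs'].click(lambda: on_quick_action("costs"), outputs=[c['message']])
        c['quick_recommend'].click(lambda: on_quick_action("recommend"), outputs=[c['message']])

    demo.launch()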