|
|
""" |
|
|
TraceMind MCP Server - Hugging Face Space Entry Point (Track 1) |
|
|
|
|
|
This file serves as the entry point for HuggingFace Space deployment. |
|
|
Exposes 11 AI-powered MCP tools + 3 Resources + 3 Prompts via Gradio's native MCP support. |
|
|
|
|
|
Built on Open Source Foundation: |
|
|
TraceVerde (genai_otel_instrument) - Automatic OpenTelemetry instrumentation
|
|
for LLM frameworks (LiteLLM, Transformers, LangChain, etc.) |
|
|
GitHub: https://github.com/Mandark-droid/genai_otel_instrument |
|
|
PyPI: https://pypi.org/project/genai-otel-instrument |
|
|
|
|
|
SMOLTRACE - Agent evaluation engine with OTEL tracing built-in
|
|
Generates structured datasets (leaderboard, results, traces, metrics) |
|
|
GitHub: https://github.com/Mandark-droid/SMOLTRACE |
|
|
PyPI: https://pypi.org/project/smoltrace/ |
|
|
|
|
|
The Flow: TraceVerde instruments → SMOLTRACE evaluates → TraceMind analyzes
|
|
|
|
|
Architecture: |
|
|
User → MCP Client (Claude Desktop, Continue, Cline, etc.)
     → MCP Endpoint (Gradio SSE)
     → TraceMind MCP Server (this file)
     → Tools (mcp_tools.py)
     → Google Gemini 2.5 Flash API
|
|
|
|
|
For Track 1: Building MCP Servers - Enterprise Category |
|
|
https://huggingface.co/MCP-1st-Birthday |
|
|
|
|
|
Tools Provided: |
|
|
- analyze_leaderboard - AI-powered leaderboard analysis
- debug_trace - Debug agent execution traces with AI
- estimate_cost - Predict evaluation costs before running
- compare_runs - Compare evaluation runs with AI analysis
- analyze_results - Analyze detailed test results with optimization recommendations
- get_top_performers - Get top N models from leaderboard (optimized)
- get_leaderboard_summary - Get leaderboard overview statistics
- get_dataset - Load SMOLTRACE datasets as JSON
- generate_synthetic_dataset - Create domain-specific test datasets
- generate_prompt_template - Generate customized smolagents prompt templates
- push_dataset_to_hub - Upload datasets to HuggingFace Hub
|
|
|
|
|
Compatible with: |
|
|
- Claude Desktop (via Gradio MCP support) |
|
|
- Continue.dev (VS Code extension) |
|
|
- Cline (VS Code extension) |
|
|
- Any MCP client supporting Gradio's MCP protocol |
|
|
""" |
|
|
|
|
|
import os |
|
|
import logging |
|
|
import gradio as gr |
|
|
from typing import Optional, Dict, Any |
|
|
from datetime import datetime |
|
|
|
|
|
|
|
|
logging.basicConfig( |
|
|
level=logging.INFO, |
|
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', |
|
|
handlers=[logging.StreamHandler()] |
|
|
) |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
from gemini_client import GeminiClient |
|
|
from mcp_tools import ( |
|
|
analyze_leaderboard, |
|
|
debug_trace, |
|
|
estimate_cost, |
|
|
compare_runs, |
|
|
analyze_results, |
|
|
get_top_performers, |
|
|
get_leaderboard_summary, |
|
|
get_dataset, |
|
|
generate_synthetic_dataset, |
|
|
generate_prompt_template, |
|
|
push_dataset_to_hub |
|
|
) |
|
|
|
|
|
|
|
|
# Create a default Gemini client at startup. GeminiClient() raises ValueError
# when it cannot be configured (e.g., the Gemini API key is missing); fall back
# to None so the Space can still start and serve the UI.
try:
    default_gemini_client = GeminiClient()
except ValueError:
    default_gemini_client = None
|
|
|
|
|
|
|
|
def create_gradio_ui(): |
|
|
"""Create Gradio UI for testing MCP tools""" |
|
|
|
|
|
|
|
|
with gr.Blocks(title="TraceMind MCP Server") as demo: |
|
|
|
|
|
gr.HTML(""" |
|
|
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
|
padding: 25px; |
|
|
border-radius: 10px; |
|
|
margin-bottom: 20px; |
|
|
text-align: center; |
|
|
box-shadow: 0 4px 6px rgba(0,0,0,0.1);"> |
|
|
<h1 style="color: white !important; margin: 0; font-size: 2.5em; font-weight: bold;"> |
|
|
π€ TraceMind MCP Server |
|
|
</h1> |
|
|
<p style="color: rgba(255,255,255,0.9); margin: 10px 0 0 0; font-size: 1.2em;"> |
|
|
AI-Powered Analysis for Agent Evaluation |
|
|
</p> |
|
|
<p style="color: rgba(255,255,255,0.8); margin: 10px 0 0 0; font-size: 0.9em;"> |
|
|
Powered by Gemini | Gradio | TraceVerde | SMOLTRACE | HuggingFace | OpenTelemetry | MCP |
|
|
</p> |
|
|
</div> |
|
|
""") |
|
|
|
|
|
gr.Markdown(""" |
|
|
**Track 1 Submission**: Building MCP (Enterprise) |
|
|
|
|
|
*AI-powered MCP server providing 11 tools, 3 resources, and 3 prompts for agent evaluation analysis.* |
|
|
""") |
|
|
|
|
|
|
|
|
with gr.Accordion("π The TraceMind Ecosystem", open=False): |
|
|
gr.Markdown(""" |
|
|
### Complete Agent Evaluation Platform |
|
|
|
|
|
TraceMind MCP Server is part of a 4-project ecosystem for comprehensive agent evaluation: |
|
|
|
|
|
#### π TraceVerde (genai_otel_instrument) |
|
|
**Foundation: OpenTelemetry Instrumentation** |
|
|
- Zero-code OTEL instrumentation for LLM frameworks |
|
|
- Automatically captures every LLM call, tool usage, and agent step |
|
|
- Works with LiteLLM, Transformers, LangChain, CrewAI, and more |
|
|
- [GitHub](https://github.com/Mandark-droid/genai_otel_instrument) | [PyPI](https://pypi.org/project/genai-otel-instrument) |
|
|
|
|
|
#### π SMOLTRACE |
|
|
**Foundation: Evaluation Engine** |
|
|
- Lightweight agent evaluation engine with built-in tracing |
|
|
- Generates structured datasets (leaderboard, results, traces, metrics) |
|
|
- Supports both API models (via LiteLLM) and local models (via Transformers) |
|
|
- [GitHub](https://github.com/Mandark-droid/SMOLTRACE) | [PyPI](https://pypi.org/project/smoltrace/) |
|
|
|
|
|
#### π€ TraceMind MCP Server (This Project) |
|
|
**Track 1: Building MCP (Enterprise)** |
|
|
- Provides AI-powered MCP tools for analyzing evaluation data |
|
|
- Uses Google Gemini 2.5 Flash for intelligent insights |
|
|
- 11 tools + 3 resources + 3 prompts |
|
|
- [HF Space](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server) |
|
|
|
|
|
#### π§ TraceMind-AI |
|
|
**Track 2: MCP in Action (Enterprise)** |
|
|
- Interactive UI that consumes MCP tools from this server |
|
|
- Leaderboard visualization with AI-powered insights |
|
|
- Autonomous agent chat powered by MCP tools |
|
|
- Multi-cloud job submission (HuggingFace Jobs + Modal) |
|
|
- [HF Space](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind) |
|
|
|
|
|
### The Flow |
|
|
``` |
|
|
TraceVerde → SMOLTRACE → Datasets
          ↓
TraceMind MCP Server (AI Tools)
          ↓
TraceMind-AI (UI + Agent)
|
|
``` |
|
|
|
|
|
**Built for**: MCP's 1st Birthday Hackathon (Nov 14-30, 2025) |
|
|
""") |
|
|
|
|
|
|
|
|
with gr.Accordion("π About This MCP Server", open=False): |
|
|
gr.Markdown(""" |
|
|
### What is This? |
|
|
|
|
|
TraceMind MCP Server provides intelligent analysis tools for agent evaluation data through the Model Context Protocol (MCP). |
|
|
|
|
|
**Powered by**: Google Gemini 2.5 Flash |
|
|
|
|
|
**π¬ [Quick Demo (5 min)](https://www.loom.com/share/d4d0003f06fa4327b46ba5c081bdf835)** | **πΊ [Full Demo (20 min)](https://www.loom.com/share/de559bb0aef749559c79117b7f951250)** |
|
|
|
|
|
### MCP Tools (11 Available) |
|
|
- π **Analyze Leaderboard** - AI-powered insights from evaluation results |
|
|
- π **Debug Trace** - Understand agent execution with AI debugging |
|
|
- π° **Estimate Cost** - Predict evaluation costs with AI recommendations |
|
|
- βοΈ **Compare Runs** - Compare evaluation runs with AI analysis |
|
|
- π **Analyze Results** - Deep dive into test results |
|
|
- π **Get Top Performers** - Quick leaderboard queries (optimized) |
|
|
- π **Get Leaderboard Summary** - High-level statistics (optimized) |
|
|
- π¦ **Get Dataset** - Load any HuggingFace dataset as JSON |
|
|
- π§ͺ **Generate Synthetic Dataset** - Create domain-specific test datasets |
|
|
- π **Generate Prompt Template** - Create customized smolagents prompts |
|
|
- π€ **Push to Hub** - Upload datasets to HuggingFace Hub |
|
|
|
|
|
### MCP Resources (3 Available) |
|
|
- π `leaderboard://{repo}` - Raw leaderboard data |
|
|
- π `trace://{trace_id}/{repo}` - Raw trace data |
|
|
- π° `cost://model/{model_name}` - Model pricing data |
|
|
|
|
|
### MCP Prompts (3 Templates) |
|
|
- π `analysis_prompt` - Analysis request templates |
|
|
- π `debug_prompt` - Debugging trace templates |
|
|
- β‘ `optimization_prompt` - Optimization recommendation templates |
|
|
""") |
|
|
|
|
|
|
|
|
with gr.Accordion("π MCP Connection Details", open=False): |
|
|
gr.Markdown(""" |
|
|
### Connect Your MCP Client |
|
|
|
|
|
**HuggingFace Space**: |
|
|
``` |
|
|
https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server |
|
|
``` |
|
|
|
|
|
**MCP Endpoint (SSE - Recommended)**: |
|
|
``` |
|
|
https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse |
|
|
``` |
|
|
|
|
|
**MCP Endpoint (Streamable HTTP)**: |
|
|
``` |
|
|
https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/ |
|
|
``` |
|
|
|
|
|
### Supported Clients |
|
|
- Claude Desktop |
|
|
- Continue.dev |
|
|
- Cline |
|
|
- Any MCP-compatible client |
|
|
""") |
|
|
|
|
|
gr.Markdown("---") |
|
|
with gr.Tabs(): |
|
|
|
|
|
with gr.Tab("π Analyze Leaderboard"): |
|
|
gr.Markdown("### Get AI-powered insights from evaluation leaderboard") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
lb_repo = gr.Textbox( |
|
|
label="Leaderboard Repository", |
|
|
value="kshitijthakkar/smoltrace-leaderboard", |
|
|
placeholder="username/dataset-name" |
|
|
) |
|
|
lb_metric = gr.Dropdown( |
|
|
label="Metric Focus", |
|
|
choices=["overall", "accuracy", "cost", "latency", "co2"], |
|
|
value="overall" |
|
|
) |
|
|
lb_time = gr.Dropdown( |
|
|
label="Time Range", |
|
|
choices=["last_week", "last_month", "all_time"], |
|
|
value="last_week" |
|
|
) |
|
|
lb_top_n = gr.Slider( |
|
|
label="Top N Models", |
|
|
minimum=3, |
|
|
maximum=10, |
|
|
value=5, |
|
|
step=1 |
|
|
) |
|
|
lb_button = gr.Button("π Analyze", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
lb_output = gr.Markdown(label="Analysis Results") |
|
|
|
|
|
async def run_analyze_leaderboard(repo, metric, time_range, top_n): |
|
|
""" |
|
|
Analyze agent evaluation leaderboard and generate AI-powered insights. |
|
|
|
|
|
This tool loads agent evaluation data from HuggingFace datasets and uses |
|
|
Google Gemini 2.5 Flash to provide intelligent analysis of top performers, |
|
|
trends, cost/performance trade-offs, and actionable recommendations. |
|
|
|
|
|
Args: |
|
|
repo (str): HuggingFace dataset repository containing leaderboard data |
|
|
metric (str): Primary metric to focus analysis on - "overall", "accuracy", "cost", "latency", or "co2" |
|
|
time_range (str): Time range for analysis - "last_week", "last_month", or "all_time" |
|
|
top_n (int): Number of top models to highlight in analysis (3-10) |
|
|
(The Gemini API key and HuggingFace token are supplied by the server's configuration, not as arguments to this function.)
|
|
|
|
|
Returns: |
|
|
str: Markdown-formatted analysis with top performers, trends, and recommendations |
|
|
""" |
|
|
try: |
|
|
result = await analyze_leaderboard( |
|
|
leaderboard_repo=repo, |
|
|
metric_focus=metric, |
|
|
time_range=time_range, |
|
|
top_n=int(top_n) |
|
|
) |
|
|
return result |
|
|
except Exception as e: |
|
|
return f"❌ **Error**: {str(e)}"
|
|
|
|
|
lb_button.click( |
|
|
fn=run_analyze_leaderboard, |
|
|
inputs=[lb_repo, lb_metric, lb_time, lb_top_n], |
|
|
outputs=[lb_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π Debug Trace"): |
|
|
gr.Markdown("### Ask questions about specific agent execution traces") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
trace_id = gr.Textbox( |
|
|
label="Trace ID", |
|
|
placeholder="trace_abc123", |
|
|
info="Get this from the Run Detail screen" |
|
|
) |
|
|
traces_repo = gr.Textbox( |
|
|
label="Traces Repository", |
|
|
placeholder="username/agent-traces-model-timestamp", |
|
|
info="Dataset containing trace data" |
|
|
) |
|
|
question = gr.Textbox( |
|
|
label="Your Question", |
|
|
placeholder="Why was tool X called twice?", |
|
|
lines=3 |
|
|
) |
|
|
trace_button = gr.Button("π Analyze", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
trace_output = gr.Markdown(label="Debug Analysis") |
|
|
|
|
|
async def run_debug_trace(trace_id_val, traces_repo_val, question_val): |
|
|
""" |
|
|
Debug a specific agent execution trace using OpenTelemetry data. |
|
|
|
|
|
This tool analyzes OpenTelemetry trace data from agent executions and uses |
|
|
Google Gemini 2.5 Flash to answer specific questions about the execution flow, |
|
|
identify bottlenecks, explain agent behavior, and provide debugging insights. |
|
|
|
|
|
Args: |
|
|
trace_id_val (str): Unique identifier for the trace to analyze (e.g., "trace_abc123") |
|
|
traces_repo_val (str): HuggingFace dataset repository containing trace data |
|
|
question_val (str): Specific question about the trace (optional, defaults to general analysis) |
|
|
(The Gemini API key and HuggingFace token are supplied by the server's configuration, not as arguments to this function.)
|
|
|
|
|
Returns: |
|
|
str: Markdown-formatted debug analysis with step-by-step breakdown and answers |
|
|
""" |
|
|
try: |
|
|
if not trace_id_val or not traces_repo_val: |
|
|
return "❌ **Error**: Please provide both Trace ID and Traces Repository"
|
|
|
|
|
result = await debug_trace( |
|
|
trace_id=trace_id_val, |
|
|
traces_repo=traces_repo_val, |
|
|
question=question_val or "Analyze this trace") |
|
|
return result |
|
|
except Exception as e: |
|
|
return f"❌ **Error**: {str(e)}"
|
|
|
|
|
trace_button.click( |
|
|
fn=run_debug_trace, |
|
|
inputs=[trace_id, traces_repo, question], |
|
|
outputs=[trace_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π° Estimate Cost"): |
|
|
gr.Markdown("### Predict evaluation costs before running") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
cost_model = gr.Textbox( |
|
|
label="Model", |
|
|
placeholder="openai/gpt-4 or meta-llama/Llama-3.1-8B", |
|
|
info="Use litellm format (provider/model)" |
|
|
) |
|
|
cost_agent_type = gr.Dropdown( |
|
|
label="Agent Type", |
|
|
choices=["tool", "code", "both"], |
|
|
value="both" |
|
|
) |
|
|
cost_num_tests = gr.Slider( |
|
|
label="Number of Tests", |
|
|
minimum=10, |
|
|
maximum=1000, |
|
|
value=100, |
|
|
step=10 |
|
|
) |
|
|
cost_hardware = gr.Dropdown( |
|
|
label="Hardware Type", |
|
|
choices=[ |
|
|
"auto", |
|
|
|
|
|
"cpu", "gpu_t4", "gpu_l4", "gpu_a10", "gpu_l40s", |
|
|
"gpu_a100", "gpu_a100_80gb", "gpu_h100", "gpu_h200", "gpu_b200", |
|
|
|
|
|
"cpu-basic", "cpu-upgrade", |
|
|
"t4-small", "t4-medium", |
|
|
"l4x1", "l4x4", |
|
|
"a10g-small", "a10g-large", "a10g-largex2", "a10g-largex4", |
|
|
"a100-large", |
|
|
"v5e-1x1", "v5e-2x2", "v5e-2x4" |
|
|
], |
|
|
value="auto", |
|
|
info="Supports Modal and HuggingFace Jobs hardware. 'auto' selects cpu-basic (API) or a10g-small (local)." |
|
|
) |
|
|
cost_button = gr.Button("π° Estimate", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
cost_output = gr.Markdown(label="Cost Estimate") |
|
|
|
|
|
async def run_estimate_cost(model, agent_type, num_tests, hardware): |
|
|
""" |
|
|
Estimate the cost, duration, and CO2 emissions of running agent evaluations. |
|
|
|
|
|
This tool predicts costs before running evaluations by calculating LLM API costs, |
|
|
HuggingFace Jobs compute costs, and CO2 emissions. Uses Google Gemini 2.5 Flash |
|
|
to provide detailed cost breakdown and optimization recommendations. |
|
|
|
|
|
Args: |
|
|
model (str): Model identifier in litellm format (e.g., "openai/gpt-4", "meta-llama/Llama-3.1-8B") |
|
|
agent_type (str): Type of agent capabilities to test - "tool", "code", or "both" |
|
|
num_tests (int): Number of test cases to run (10-1000) |
|
|
hardware (str): Hardware type - "auto" or any supported Modal / HuggingFace Jobs hardware type (e.g., "cpu", "gpu_a10", "a10g-small")
|
|
(The Gemini API key is supplied by the server's configuration, not as an argument to this function.)
|
|
|
|
|
Returns: |
|
|
str: Markdown-formatted cost estimate with LLM costs, HF Jobs costs, duration, CO2, and tips |
|
|
""" |
|
|
try: |
|
|
if not model: |
|
|
return "❌ **Error**: Please provide a model name"
|
|
|
|
|
result = await estimate_cost( |
|
|
model=model, |
|
|
agent_type=agent_type, |
|
|
num_tests=int(num_tests), |
|
|
hardware=hardware |
|
|
) |
|
|
return result |
|
|
except Exception as e: |
|
|
return f"❌ **Error**: {str(e)}"
|
|
|
|
|
cost_button.click( |
|
|
fn=run_estimate_cost, |
|
|
inputs=[cost_model, cost_agent_type, cost_num_tests, cost_hardware], |
|
|
outputs=[cost_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("βοΈ Compare Runs"): |
|
|
gr.Markdown(""" |
|
|
## Compare Two Evaluation Runs |
|
|
|
|
|
Compare two evaluation runs with AI-powered analysis across multiple dimensions: |
|
|
success rate, cost efficiency, speed, environmental impact, and more. |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
compare_run_id_1 = gr.Textbox( |
|
|
label="First Run ID", |
|
|
placeholder="e.g., run_abc123", |
|
|
info="Enter the run_id from the leaderboard" |
|
|
) |
|
|
with gr.Column(): |
|
|
compare_run_id_2 = gr.Textbox( |
|
|
label="Second Run ID", |
|
|
placeholder="e.g., run_xyz789", |
|
|
info="Enter the run_id to compare against" |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
compare_focus = gr.Dropdown( |
|
|
choices=["comprehensive", "cost", "performance", "eco_friendly"], |
|
|
value="comprehensive", |
|
|
label="Comparison Focus", |
|
|
info="Choose what aspect to focus the comparison on" |
|
|
) |
|
|
compare_repo = gr.Textbox( |
|
|
label="Leaderboard Repository", |
|
|
value="kshitijthakkar/smoltrace-leaderboard", |
|
|
info="HuggingFace dataset containing leaderboard data" |
|
|
) |
|
|
|
|
|
compare_button = gr.Button("π Compare Runs", variant="primary") |
|
|
compare_output = gr.Markdown() |
|
|
|
|
|
async def run_compare_runs(run_id_1, run_id_2, focus, repo): |
|
|
""" |
|
|
Compare two evaluation runs and generate AI-powered comparative analysis. |
|
|
|
|
|
This tool fetches data for two evaluation runs from the leaderboard and uses |
|
|
Google Gemini 2.5 Flash to provide intelligent comparison across multiple dimensions: |
|
|
success rate, cost efficiency, speed, environmental impact, and use case recommendations. |
|
|
|
|
|
Args: |
|
|
run_id_1 (str): First run ID from the leaderboard to compare |
|
|
run_id_2 (str): Second run ID from the leaderboard to compare against |
|
|
focus (str): Focus area - "comprehensive", "cost", "performance", or "eco_friendly" |
|
|
repo (str): HuggingFace dataset repository containing leaderboard data |
|
|
(The Gemini API key and HuggingFace token are supplied by the server's configuration, not as arguments to this function.)
|
|
|
|
|
Returns: |
|
|
str: Markdown-formatted comparative analysis with winners, trade-offs, and recommendations |
|
|
""" |
|
|
try: |
|
|
result = await compare_runs( |
|
|
run_id_1=run_id_1, |
|
|
run_id_2=run_id_2, |
|
|
leaderboard_repo=repo, |
|
|
comparison_focus=focus |
|
|
) |
|
|
return result |
|
|
except Exception as e: |
|
|
return f"❌ **Error**: {str(e)}"
|
|
|
|
|
compare_button.click( |
|
|
fn=run_compare_runs, |
|
|
inputs=[compare_run_id_1, compare_run_id_2, compare_focus, compare_repo], |
|
|
outputs=[compare_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π Analyze Results"): |
|
|
gr.Markdown(""" |
|
|
## Analyze Test Results & Get Optimization Recommendations |
|
|
|
|
|
Deep dive into individual test case results to identify failure patterns, |
|
|
performance bottlenecks, and cost optimization opportunities. |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
results_repo_input = gr.Textbox( |
|
|
label="Results Repository", |
|
|
placeholder="e.g., username/smoltrace-results-gpt4-20251114", |
|
|
info="HuggingFace dataset containing results data" |
|
|
) |
|
|
results_focus = gr.Dropdown( |
|
|
choices=["comprehensive", "failures", "performance", "cost"], |
|
|
value="comprehensive", |
|
|
label="Analysis Focus", |
|
|
info="What aspect to focus the analysis on" |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
results_max_rows = gr.Slider( |
|
|
minimum=10, |
|
|
maximum=500, |
|
|
value=100, |
|
|
step=10, |
|
|
label="Max Test Cases to Analyze", |
|
|
info="Limit number of test cases for analysis" |
|
|
) |
|
|
|
|
|
results_button = gr.Button("π Analyze Results", variant="primary") |
|
|
results_output = gr.Markdown() |
|
|
|
|
|
async def run_analyze_results(repo, focus, max_rows): |
|
|
""" |
|
|
Analyze detailed test results and provide optimization recommendations. |
|
|
|
|
|
Args: |
|
|
repo (str): HuggingFace dataset repository containing results |
|
|
focus (str): Analysis focus area |
|
|
max_rows (int): Maximum test cases to analyze |
|
|
(The Gemini API key and HuggingFace token are supplied by the server's configuration, not as arguments to this function.)
|
|
|
|
|
Returns: |
|
|
str: Markdown-formatted analysis with recommendations |
|
|
""" |
|
|
try: |
|
|
if not repo: |
|
|
return "❌ **Error**: Please provide a results repository"
|
|
|
|
|
result = await analyze_results( |
|
|
results_repo=repo, |
|
|
analysis_focus=focus, |
|
|
max_rows=int(max_rows) |
|
|
) |
|
|
return result |
|
|
except Exception as e: |
|
|
return f"❌ **Error**: {str(e)}"
|
|
|
|
|
results_button.click( |
|
|
fn=run_analyze_results, |
|
|
inputs=[results_repo_input, results_focus, results_max_rows], |
|
|
outputs=[results_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π Get Top Performers"): |
|
|
gr.Markdown(""" |
|
|
## Get Top Performing Models (Token-Optimized) |
|
|
|
|
|
Quickly retrieve top N models from the leaderboard without loading all runs. |
|
|
**90% token reduction** compared to loading the full leaderboard dataset. |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
top_perf_repo = gr.Textbox( |
|
|
label="Leaderboard Repository", |
|
|
value="kshitijthakkar/smoltrace-leaderboard", |
|
|
placeholder="username/dataset-name" |
|
|
) |
|
|
top_perf_metric = gr.Dropdown( |
|
|
label="Ranking Metric", |
|
|
choices=["success_rate", "total_cost_usd", "avg_duration_ms", "co2_emissions_g"], |
|
|
value="success_rate", |
|
|
info="Metric to rank models by" |
|
|
) |
|
|
top_perf_n = gr.Slider( |
|
|
label="Top N Models", |
|
|
minimum=1, |
|
|
maximum=20, |
|
|
value=5, |
|
|
step=1, |
|
|
info="Number of top models to return" |
|
|
) |
|
|
top_perf_button = gr.Button("π Get Top Performers", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
top_perf_output = gr.JSON(label="Top Performers (JSON)") |
|
|
|
|
|
async def run_get_top_performers(repo, metric, top_n): |
|
|
"""Get top performing models from leaderboard.""" |
|
|
try: |
|
|
import json |
|
|
result = await get_top_performers( |
|
|
leaderboard_repo=repo, |
|
|
metric=metric, |
|
|
top_n=int(top_n) |
|
|
) |
|
|
return json.loads(result) |
|
|
except Exception as e: |
|
|
return {"error": str(e)} |
|
|
|
|
|
top_perf_button.click( |
|
|
fn=run_get_top_performers, |
|
|
inputs=[top_perf_repo, top_perf_metric, top_perf_n], |
|
|
outputs=[top_perf_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π Get Leaderboard Summary"): |
|
|
gr.Markdown(""" |
|
|
## Get Leaderboard Overview Statistics (Token-Optimized) |
|
|
|
|
|
Get high-level summary statistics without loading individual runs. |
|
|
**99% token reduction** compared to loading the full leaderboard dataset. |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
summary_repo = gr.Textbox( |
|
|
label="Leaderboard Repository", |
|
|
value="kshitijthakkar/smoltrace-leaderboard", |
|
|
placeholder="username/dataset-name" |
|
|
) |
|
|
summary_button = gr.Button("π Get Summary", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
summary_output = gr.JSON(label="Leaderboard Summary (JSON)") |
|
|
|
|
|
async def run_get_leaderboard_summary(repo): |
|
|
"""Get leaderboard summary statistics.""" |
|
|
try: |
|
|
import json |
|
|
result = await get_leaderboard_summary(leaderboard_repo=repo) |
|
|
return json.loads(result) |
|
|
except Exception as e: |
|
|
return {"error": str(e)} |
|
|
|
|
|
summary_button.click( |
|
|
fn=run_get_leaderboard_summary, |
|
|
inputs=[summary_repo], |
|
|
outputs=[summary_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π¦ Get Dataset"): |
|
|
gr.Markdown(""" |
|
|
## Load SMOLTRACE Datasets as JSON |
|
|
|
|
|
This tool loads datasets with the **smoltrace-** prefix and returns the raw data as JSON. |
|
|
Use this to access leaderboard data, results datasets, traces datasets, or metrics datasets. |
|
|
|
|
|
**Restriction**: Only datasets with "smoltrace-" in the name are allowed for security. |
|
|
|
|
|
**Tip**: If you don't know which dataset to load, first load the leaderboard to see |
|
|
dataset references in the `results_dataset`, `traces_dataset`, `metrics_dataset` fields. |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
dataset_repo_input = gr.Textbox( |
|
|
label="Dataset Repository (must contain 'smoltrace-')", |
|
|
placeholder="e.g., kshitijthakkar/smoltrace-leaderboard", |
|
|
value="kshitijthakkar/smoltrace-leaderboard", |
|
|
info="HuggingFace dataset repository path with smoltrace- prefix" |
|
|
) |
|
|
dataset_max_rows = gr.Slider( |
|
|
minimum=1, |
|
|
maximum=200, |
|
|
value=50, |
|
|
step=1, |
|
|
label="Max Rows", |
|
|
info="Limit rows to avoid token limits" |
|
|
) |
|
|
|
|
|
dataset_button = gr.Button("π₯ Load Dataset", variant="primary") |
|
|
dataset_output = gr.JSON(label="Dataset JSON Output") |
|
|
|
|
|
async def run_get_dataset(repo, max_rows): |
|
|
""" |
|
|
Load SMOLTRACE datasets from HuggingFace and return as JSON. |
|
|
|
|
|
This tool loads datasets with the "smoltrace-" prefix and returns the raw data |
|
|
as JSON. Use this to access leaderboard data, results datasets, traces datasets, |
|
|
or metrics datasets. Only datasets with "smoltrace-" in the name are allowed. |
|
|
|
|
|
Args: |
|
|
repo (str): HuggingFace dataset repository path with "smoltrace-" prefix (e.g., "kshitijthakkar/smoltrace-leaderboard") |
|
|
max_rows (int): Maximum number of rows to return (1-200, default 50) |
|
|
(The HuggingFace token is supplied by the server's configuration, not as an argument to this function.)
|
|
|
|
|
Returns: |
|
|
dict: JSON object with dataset data, metadata, total rows, and column names |
|
|
""" |
|
|
try: |
|
|
import json |
|
|
result = await get_dataset( |
|
|
dataset_repo=repo, |
|
|
max_rows=int(max_rows) |
|
|
) |
|
|
|
|
|
return json.loads(result) |
|
|
except Exception as e: |
|
|
return {"error": str(e)} |
|
|
|
|
|
dataset_button.click( |
|
|
fn=run_get_dataset, |
|
|
inputs=[dataset_repo_input, dataset_max_rows], |
|
|
outputs=[dataset_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π§ͺ Generate Synthetic Dataset"): |
|
|
gr.Markdown(""" |
|
|
## Create Domain-Specific Test Datasets for SMOLTRACE |
|
|
|
|
|
Use AI to generate synthetic evaluation tasks tailored to your domain and tools. |
|
|
Perfect for creating custom benchmarks when standard datasets don't fit your use case. |
|
|
|
|
|
**π― Enterprise Use Case**: Quickly create evaluation datasets for: |
|
|
- Custom tools and APIs your agents use |
|
|
- Industry-specific domains (finance, healthcare, legal, etc.) |
|
|
- Internal workflows and processes |
|
|
- Specialized agent capabilities |
|
|
|
|
|
**Output Format**: SMOLTRACE-compatible task dataset ready for HuggingFace upload |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
synth_domain = gr.Textbox( |
|
|
label="Domain", |
|
|
placeholder="e.g., finance, healthcare, travel, ecommerce, customer_support", |
|
|
value="travel", |
|
|
info="The domain/industry for your synthetic tasks" |
|
|
) |
|
|
synth_tools = gr.Textbox( |
|
|
label="Tool Names (comma-separated)", |
|
|
placeholder="e.g., get_weather,search_flights,book_hotel,currency_converter", |
|
|
value="get_weather,search_flights,book_hotel", |
|
|
info="Names of tools your agent can use", |
|
|
lines=2 |
|
|
) |
|
|
synth_num_tasks = gr.Slider( |
|
|
label="Number of Tasks", |
|
|
minimum=5, |
|
|
maximum=100, |
|
|
value=10, |
|
|
step=1, |
|
|
info="Total number of synthetic tasks to generate" |
|
|
) |
|
|
synth_difficulty = gr.Dropdown( |
|
|
label="Difficulty Distribution", |
|
|
choices=["balanced", "easy_only", "medium_only", "hard_only", "progressive"], |
|
|
value="balanced", |
|
|
info="How to distribute task difficulty" |
|
|
) |
|
|
synth_agent_type = gr.Dropdown( |
|
|
label="Agent Type", |
|
|
choices=["both", "tool", "code"], |
|
|
value="both", |
|
|
info="Target agent type for the tasks" |
|
|
) |
|
|
synth_button = gr.Button("π§ͺ Generate Synthetic Dataset", variant="primary", size="lg") |
|
|
|
|
|
with gr.Column(): |
|
|
synth_output = gr.JSON(label="Generated Dataset (JSON)") |
|
|
|
|
|
gr.Markdown(""" |
|
|
### π Next Steps |
|
|
|
|
|
After generation: |
|
|
1. **Copy the `tasks` array** from the JSON output above |
|
|
2. **Use the "Push to Hub" tab** to upload directly to HuggingFace |
|
|
3. **Or upload manually** following the instructions in the output |
|
|
|
|
|
**π‘ Tip**: The generated dataset includes usage instructions and follows SMOLTRACE naming convention! |
|
|
""") |
|
|
|
|
|
async def run_generate_synthetic(domain, tools, num_tasks, difficulty, agent_type): |
|
|
"""Generate synthetic dataset with async support.""" |
|
|
try: |
|
|
import json |
|
|
result = await generate_synthetic_dataset( |
|
|
domain=domain, |
|
|
tool_names=tools, |
|
|
num_tasks=int(num_tasks), |
|
|
difficulty_distribution=difficulty, |
|
|
agent_type=agent_type |
|
|
) |
|
|
return json.loads(result) |
|
|
except Exception as e: |
|
|
return {"error": str(e)} |
|
|
|
|
|
synth_button.click( |
|
|
fn=run_generate_synthetic, |
|
|
inputs=[synth_domain, synth_tools, synth_num_tasks, synth_difficulty, synth_agent_type], |
|
|
outputs=[synth_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π Generate Prompt Template"): |
|
|
gr.Markdown(""" |
|
|
## Create Customized Agent Prompt Template |
|
|
|
|
|
Generate a domain-specific prompt template based on smolagents templates. |
|
|
This template can be used with your synthetic dataset to run SMOLTRACE evaluations. |
|
|
|
|
|
**π― Use Case**: After generating a synthetic dataset, create a matching prompt template |
|
|
that agents can use during evaluation. This ensures your evaluation setup is complete. |
|
|
|
|
|
**Output**: Customized YAML prompt template ready for use with smolagents |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
prompt_domain = gr.Textbox( |
|
|
label="Domain", |
|
|
placeholder="e.g., finance, healthcare, customer_support", |
|
|
value="travel", |
|
|
info="The domain/industry for the prompt template" |
|
|
) |
|
|
prompt_tools = gr.Textbox( |
|
|
label="Tool Names (comma-separated)", |
|
|
placeholder="e.g., get_weather,search_flights,book_hotel", |
|
|
value="get_weather,search_flights,book_hotel", |
|
|
info="Names of tools the agent will use", |
|
|
lines=2 |
|
|
) |
|
|
prompt_agent_type = gr.Dropdown( |
|
|
label="Agent Type", |
|
|
choices=["tool", "code"], |
|
|
value="tool", |
|
|
info="ToolCallingAgent (tool) or CodeAgent (code)" |
|
|
) |
|
|
prompt_button = gr.Button("π Generate Prompt Template", variant="primary", size="lg") |
|
|
|
|
|
with gr.Column(): |
|
|
prompt_output = gr.JSON(label="Generated Prompt Template (JSON)") |
|
|
|
|
|
gr.Markdown(""" |
|
|
### π Next Steps |
|
|
|
|
|
After generation: |
|
|
1. **Copy the `prompt_template`** from the JSON output above |
|
|
2. **Save it as a YAML file** (e.g., `{domain}_agent.yaml`) |
|
|
3. **Include it in your HuggingFace dataset** card or repository |
|
|
4. **Use it with SMOLTRACE** when running evaluations |
|
|
|
|
|
**π‘ Tip**: This template is AI-customized for your domain and tools! |
|
|
""") |
|
|
|
|
|
async def run_generate_prompt_template(domain, tools, agent_type): |
|
|
"""Generate prompt template with async support.""" |
|
|
try: |
|
|
import json |
|
|
result = await generate_prompt_template( |
|
|
domain=domain, |
|
|
tool_names=tools, |
|
|
agent_type=agent_type |
|
|
) |
|
|
return json.loads(result) |
|
|
except Exception as e: |
|
|
return {"error": str(e)} |
|
|
|
|
|
prompt_button.click( |
|
|
fn=run_generate_prompt_template, |
|
|
inputs=[prompt_domain, prompt_tools, prompt_agent_type], |
|
|
outputs=[prompt_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π€ Push to Hub"): |
|
|
gr.Markdown(""" |
|
|
## Upload Generated Dataset to HuggingFace Hub |
|
|
|
|
|
Upload your synthetic dataset (from the previous tab or any SMOLTRACE-format dataset) |
|
|
directly to HuggingFace Hub. |
|
|
|
|
|
**Requirements**: |
|
|
- HuggingFace account |
|
|
- API token with write permissions ([Get one here](https://huggingface.co/settings/tokens)) |
|
|
- Dataset in SMOLTRACE format |
|
|
|
|
|
**Naming Convention**: `{username}/smoltrace-{domain}-tasks` or `{username}/smoltrace-{domain}-tasks-v1` |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
push_dataset_json = gr.Textbox( |
|
|
label="Dataset JSON (tasks array)", |
|
|
placeholder='[{"id": "task_001", "prompt": "...", "expected_tool": "...", ...}]', |
|
|
info="Paste the 'tasks' array from generate_synthetic_dataset output", |
|
|
lines=10 |
|
|
) |
|
|
push_repo_name = gr.Textbox( |
|
|
label="Repository Name", |
|
|
placeholder="your-username/smoltrace-finance-tasks", |
|
|
info="HuggingFace repo name (follow SMOLTRACE convention)", |
|
|
value="" |
|
|
) |
|
|
push_hf_token = gr.Textbox( |
|
|
label="HuggingFace Token", |
|
|
placeholder="hf_...", |
|
|
info="API token with write permissions", |
|
|
type="password" |
|
|
) |
|
|
push_private = gr.Checkbox( |
|
|
label="Make dataset private", |
|
|
value=False, |
|
|
info="Private datasets are only visible to you" |
|
|
) |
|
|
|
|
|
push_prompt_template = gr.Textbox( |
|
|
label="Prompt Template (Optional)", |
|
|
placeholder="Leave empty if not using prompt template", |
|
|
info="YAML prompt template to include in dataset card", |
|
|
lines=5, |
|
|
visible=True, |
|
|
value="" |
|
|
) |
|
|
push_button = gr.Button("π€ Push to HuggingFace Hub", variant="primary", size="lg") |
|
|
|
|
|
with gr.Column(): |
|
|
push_output = gr.JSON(label="Upload Result") |
|
|
|
|
|
gr.Markdown(""" |
|
|
### π After Upload |
|
|
|
|
|
Once uploaded, you can: |
|
|
1. **View your dataset** at the URL provided in the output |
|
|
2. **Use in SMOLTRACE** evaluations with the command shown |
|
|
3. **Share with your team** (if public) or manage access (if private) |
|
|
|
|
|
**Example**: After uploading to `company/smoltrace-finance-tasks`: |
|
|
```bash |
|
|
smoltrace-eval --model openai/gpt-4 --dataset-name company/smoltrace-finance-tasks |
|
|
``` |
|
|
""") |
|
|
|
|
|
async def run_push_dataset(dataset_json, repo_name, hf_token, private, prompt_template=""): |
|
|
"""Push dataset to hub with async support and optional prompt template.""" |
|
|
try: |
|
|
import json |
|
|
result = await push_dataset_to_hub( |
|
|
dataset_json=dataset_json, |
|
|
repo_name=repo_name, |
|
|
hf_token=hf_token, |
|
|
private=private, |
|
|
prompt_template=prompt_template if prompt_template else None |
|
|
) |
|
|
return json.loads(result) |
|
|
except Exception as e: |
|
|
return {"error": str(e)} |
|
|
|
|
|
push_button.click( |
|
|
fn=run_push_dataset, |
|
|
inputs=[push_dataset_json, push_repo_name, push_hf_token, push_private, push_prompt_template], |
|
|
outputs=[push_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π MCP Resources & Prompts"): |
|
|
gr.Markdown(""" |
|
|
## MCP Resources & Prompts |
|
|
|
|
|
Beyond the 11 MCP Tools, this server also exposes **MCP Resources** and **MCP Prompts**
|
|
that MCP clients can use directly. |
|
|
|
|
|
### MCP Resources (Read-Only Data Access) |
|
|
|
|
|
Resources provide direct access to data without AI processing: |
|
|
|
|
|
#### 1. `leaderboard://{repo}` |
|
|
Get raw leaderboard data in JSON format. |
|
|
|
|
|
**Example**: `leaderboard://kshitijthakkar/smoltrace-leaderboard` |
|
|
|
|
|
**Returns**: JSON with all evaluation runs |
|
|
|
|
|
#### 2. `trace://{trace_id}/{repo}` |
|
|
Get raw trace data for a specific trace. |
|
|
|
|
|
**Example**: `trace://trace_abc123/kshitijthakkar/smoltrace-traces-gpt4` |
|
|
|
|
|
**Returns**: JSON with OpenTelemetry spans |
|
|
|
|
|
#### 3. `cost://model/{model_name}` |
|
|
Get cost information for a specific model. |
|
|
|
|
|
**Example**: `cost://model/openai/gpt-4` |
|
|
|
|
|
**Returns**: JSON with pricing data |
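For reference, the same data the resources expose can also be fetched directly in Python via the helper functions this UI uses for the test buttons below (a minimal sketch; values are illustrative):

```python
from mcp_tools import get_leaderboard_data, get_cost_data
import json

# Same data as leaderboard://kshitijthakkar/smoltrace-leaderboard
leaderboard = json.loads(get_leaderboard_data("kshitijthakkar/smoltrace-leaderboard"))

# Same data as cost://model/openai/gpt-4
gpt4_pricing = json.loads(get_cost_data("openai/gpt-4"))
```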
|
|
|
|
|
--- |
|
|
|
|
|
### MCP Prompts (Reusable Templates) |
|
|
|
|
|
Prompts provide standardized templates for common workflows: |
|
|
|
|
|
#### 1. `analysis_prompt(analysis_type, focus_area, detail_level)` |
|
|
Generate analysis prompt templates. |
|
|
|
|
|
**Parameters**: |
|
|
- `analysis_type`: "leaderboard", "trace", "cost" |
|
|
- `focus_area`: "overall", "performance", "cost", "efficiency" |
|
|
- `detail_level`: "summary", "detailed", "comprehensive" |
|
|
|
|
|
#### 2. `debug_prompt(debug_type, context)` |
|
|
Generate debugging prompt templates. |
|
|
|
|
|
**Parameters**: |
|
|
- `debug_type`: "error", "performance", "behavior", "optimization" |
|
|
- `context`: "agent_execution", "tool_calling", "llm_reasoning" |
|
|
|
|
|
#### 3. `optimization_prompt(optimization_goal, constraints)` |
|
|
Generate optimization prompt templates. |
|
|
|
|
|
**Parameters**: |
|
|
- `optimization_goal`: "cost", "speed", "quality", "efficiency" |
|
|
- `constraints`: "maintain_quality", "maintain_speed", "no_constraints" |
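For reference, the same prompt helpers this UI calls can be used directly in Python (a minimal sketch; argument values are illustrative):

```python
from mcp_tools import analysis_prompt, debug_prompt, optimization_prompt

lb_prompt = analysis_prompt("leaderboard", "cost", "detailed")
dbg_prompt = debug_prompt("error", "tool_calling")
opt_prompt = optimization_prompt("cost", "maintain_quality")
```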
|
|
|
|
|
--- |
|
|
|
|
|
### Testing MCP Resources |
|
|
|
|
|
Test resources directly from this UI: |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
gr.Markdown("#### Test Leaderboard Resource") |
|
|
resource_lb_repo = gr.Textbox( |
|
|
label="Repository", |
|
|
value="kshitijthakkar/smoltrace-leaderboard" |
|
|
) |
|
|
resource_lb_button = gr.Button("Fetch Leaderboard Data", variant="primary") |
|
|
resource_lb_output = gr.JSON(label="Resource Output") |
|
|
|
|
|
def test_leaderboard_resource(repo): |
|
|
""" |
|
|
Test the leaderboard MCP resource by fetching raw leaderboard data. |
|
|
|
|
|
Args: |
|
|
repo (str): HuggingFace dataset repository name |
|
|
|
|
|
Returns: |
|
|
dict: JSON object with leaderboard data |
|
|
""" |
|
|
from mcp_tools import get_leaderboard_data |
|
|
import json |
|
|
result = get_leaderboard_data(repo) |
|
|
return json.loads(result) |
|
|
|
|
|
resource_lb_button.click( |
|
|
fn=test_leaderboard_resource, |
|
|
inputs=[resource_lb_repo], |
|
|
outputs=[resource_lb_output] |
|
|
) |
|
|
|
|
|
with gr.Column(): |
|
|
gr.Markdown("#### Test Cost Resource") |
|
|
resource_cost_model = gr.Textbox( |
|
|
label="Model Name", |
|
|
value="openai/gpt-4" |
|
|
) |
|
|
resource_cost_button = gr.Button("Fetch Cost Data", variant="primary") |
|
|
resource_cost_output = gr.JSON(label="Resource Output") |
|
|
|
|
|
def test_cost_resource(model): |
|
|
""" |
|
|
Test the cost MCP resource by fetching model pricing data. |
|
|
|
|
|
Args: |
|
|
model (str): Model identifier (e.g., "openai/gpt-4") |
|
|
|
|
|
Returns: |
|
|
dict: JSON object with cost and pricing information |
|
|
""" |
|
|
from mcp_tools import get_cost_data |
|
|
import json |
|
|
result = get_cost_data(model) |
|
|
return json.loads(result) |
|
|
|
|
|
resource_cost_button.click( |
|
|
fn=test_cost_resource, |
|
|
inputs=[resource_cost_model], |
|
|
outputs=[resource_cost_output] |
|
|
) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### Testing MCP Prompts") |
|
|
gr.Markdown("Generate prompt templates for different scenarios:") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
prompt_type = gr.Radio( |
|
|
label="Prompt Type", |
|
|
choices=["analysis_prompt", "debug_prompt", "optimization_prompt"], |
|
|
value="analysis_prompt" |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Group(visible=True) as analysis_group: |
|
|
analysis_type = gr.Dropdown( |
|
|
label="Analysis Type", |
|
|
choices=["leaderboard", "trace", "cost"], |
|
|
value="leaderboard" |
|
|
) |
|
|
focus_area = gr.Dropdown( |
|
|
label="Focus Area", |
|
|
choices=["overall", "performance", "cost", "efficiency"], |
|
|
value="overall" |
|
|
) |
|
|
detail_level = gr.Dropdown( |
|
|
label="Detail Level", |
|
|
choices=["summary", "detailed", "comprehensive"], |
|
|
value="detailed" |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Group(visible=False) as debug_group: |
|
|
debug_type = gr.Dropdown( |
|
|
label="Debug Type", |
|
|
choices=["error", "performance", "behavior", "optimization"], |
|
|
value="error" |
|
|
) |
|
|
debug_context = gr.Dropdown( |
|
|
label="Context", |
|
|
choices=["agent_execution", "tool_calling", "llm_reasoning"], |
|
|
value="agent_execution" |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Group(visible=False) as optimization_group: |
|
|
optimization_goal = gr.Dropdown( |
|
|
label="Optimization Goal", |
|
|
choices=["cost", "speed", "quality", "efficiency"], |
|
|
value="cost" |
|
|
) |
|
|
constraints = gr.Dropdown( |
|
|
label="Constraints", |
|
|
choices=["maintain_quality", "maintain_speed", "no_constraints"], |
|
|
value="maintain_quality" |
|
|
) |
|
|
|
|
|
prompt_button = gr.Button("Generate Prompt", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
prompt_output = gr.Textbox( |
|
|
label="Generated Prompt Template", |
|
|
lines=10, |
|
|
max_lines=20 |
|
|
) |
|
|
|
|
|
def toggle_prompt_groups(prompt_type): |
|
|
""" |
|
|
Toggle visibility of prompt parameter groups based on selected prompt type. |
|
|
|
|
|
Args: |
|
|
prompt_type (str): The type of prompt selected |
|
|
|
|
|
Returns: |
|
|
dict: Gradio update objects for group visibility |
|
|
""" |
|
|
return { |
|
|
analysis_group: gr.update(visible=(prompt_type == "analysis_prompt")), |
|
|
debug_group: gr.update(visible=(prompt_type == "debug_prompt")), |
|
|
optimization_group: gr.update(visible=(prompt_type == "optimization_prompt")) |
|
|
} |
|
|
|
|
|
prompt_type.change( |
|
|
fn=toggle_prompt_groups, |
|
|
inputs=[prompt_type], |
|
|
outputs=[analysis_group, debug_group, optimization_group] |
|
|
) |
|
|
|
|
|
def generate_prompt( |
|
|
prompt_type, |
|
|
analysis_type_val, focus_area_val, detail_level_val, |
|
|
debug_type_val, debug_context_val, |
|
|
optimization_goal_val, constraints_val |
|
|
): |
|
|
""" |
|
|
Generate a prompt template based on the selected type and parameters. |
|
|
|
|
|
Args: |
|
|
prompt_type (str): Type of prompt to generate |
|
|
analysis_type_val (str): Analysis type parameter |
|
|
focus_area_val (str): Focus area parameter |
|
|
detail_level_val (str): Detail level parameter |
|
|
debug_type_val (str): Debug type parameter |
|
|
debug_context_val (str): Debug context parameter |
|
|
optimization_goal_val (str): Optimization goal parameter |
|
|
constraints_val (str): Constraints parameter |
|
|
|
|
|
Returns: |
|
|
str: Generated prompt template text |
|
|
""" |
|
|
from mcp_tools import analysis_prompt, debug_prompt, optimization_prompt |
|
|
|
|
|
if prompt_type == "analysis_prompt": |
|
|
return analysis_prompt(analysis_type_val, focus_area_val, detail_level_val) |
|
|
elif prompt_type == "debug_prompt": |
|
|
return debug_prompt(debug_type_val, debug_context_val) |
|
|
elif prompt_type == "optimization_prompt": |
|
|
return optimization_prompt(optimization_goal_val, constraints_val) |
|
|
|
|
|
prompt_button.click( |
|
|
fn=generate_prompt, |
|
|
inputs=[ |
|
|
prompt_type, |
|
|
analysis_type, focus_area, detail_level, |
|
|
debug_type, debug_context, |
|
|
optimization_goal, constraints |
|
|
], |
|
|
outputs=[prompt_output] |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("π API Documentation"): |
|
|
gr.Markdown(""" |
|
|
## MCP Tool Specifications |
|
|
|
|
|
Click on each tool to expand its documentation. |
|
|
|
|
|
<details> |
|
|
<summary><h3>π 1. analyze_leaderboard</h3></summary> |
|
|
|
|
|
**Description**: Generate AI-powered insights from evaluation leaderboard data |
|
|
|
|
|
**Parameters**: |
|
|
- `leaderboard_repo` (str): HuggingFace dataset repository (default: "kshitijthakkar/smoltrace-leaderboard") |
|
|
- `metric_focus` (str): "overall", "accuracy", "cost", "latency", or "co2" (default: "overall") |
|
|
- `time_range` (str): "last_week", "last_month", or "all_time" (default: "last_week") |
|
|
- `top_n` (int): Number of top models to highlight (default: 5, min: 3, max: 10) |
|
|
|
|
|
**Returns**: Markdown-formatted analysis with top performers, trends, and recommendations |
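**Example** (illustrative call, mirroring how this Space's UI invokes the async tool):

```python
# inside an async function
result = await analyze_leaderboard(
    leaderboard_repo="kshitijthakkar/smoltrace-leaderboard",
    metric_focus="cost",
    time_range="last_month",
    top_n=5
)
```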
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π 2. debug_trace</h3></summary> |
|
|
|
|
|
**Description**: Answer questions about specific agent execution traces |
|
|
|
|
|
**Parameters**: |
|
|
- `trace_id` (str, required): Unique identifier for the trace |
|
|
- `traces_repo` (str, required): HuggingFace dataset repository with trace data |
|
|
- `question` (str): Specific question about the trace (default: "Analyze this trace and explain what happened") |
|
|
|
|
|
**Returns**: Markdown-formatted debug analysis with step-by-step breakdown |
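**Example** (illustrative; the trace ID and repository below are placeholders):

```python
# inside an async function
result = await debug_trace(
    trace_id="trace_abc123",
    traces_repo="kshitijthakkar/smoltrace-traces-gpt4",
    question="Why was tool X called twice?"
)
```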
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π° 3. estimate_cost</h3></summary> |
|
|
|
|
|
**Description**: Predict evaluation costs before running |
|
|
|
|
|
**Parameters**: |
|
|
- `model` (str, required): Model identifier in litellm format (e.g., "openai/gpt-4") |
|
|
- `agent_type` (str, required): "tool", "code", or "both" |
|
|
- `num_tests` (int): Number of test cases (default: 100, min: 10, max: 1000) |
|
|
- `hardware` (str): "auto" or any supported Modal / HuggingFace Jobs hardware type (e.g., "cpu", "gpu_a10", "a10g-small"); default: "auto"
|
|
|
|
|
**Returns**: Markdown-formatted cost estimate with breakdown and optimization tips |
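**Example** (illustrative call with default-style values):

```python
# inside an async function
result = await estimate_cost(
    model="openai/gpt-4",
    agent_type="both",
    num_tests=100,
    hardware="auto"
)
```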
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>βοΈ 4. compare_runs</h3></summary> |
|
|
|
|
|
**Description**: Compare two evaluation runs with AI-powered analysis |
|
|
|
|
|
**Parameters**: |
|
|
- `run_id_1` (str, required): First run ID from the leaderboard |
|
|
- `run_id_2` (str, required): Second run ID to compare against |
|
|
- `leaderboard_repo` (str): HuggingFace dataset repository (default: "kshitijthakkar/smoltrace-leaderboard") |
|
|
- `comparison_focus` (str): "comprehensive", "cost", "performance", or "eco_friendly" (default: "comprehensive") |
|
|
|
|
|
**Returns**: Markdown-formatted comparative analysis with winner for each category, trade-offs, and recommendations |
|
|
|
|
|
**Focus Options**: |
|
|
- `comprehensive`: Complete comparison across all dimensions (success rate, cost, speed, CO2, GPU) |
|
|
- `cost`: Detailed cost efficiency analysis and ROI |
|
|
- `performance`: Speed and accuracy trade-off analysis |
|
|
- `eco_friendly`: Environmental impact and carbon footprint comparison |
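**Example** (illustrative; run IDs are placeholders):

```python
# inside an async function
result = await compare_runs(
    run_id_1="run_abc123",
    run_id_2="run_xyz789",
    leaderboard_repo="kshitijthakkar/smoltrace-leaderboard",
    comparison_focus="cost"
)
```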
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π 5. get_top_performers</h3></summary> |
|
|
|
|
|
**Description**: Get top performing models from leaderboard - optimized for quick queries |
|
|
|
|
|
**β‘ Performance**: This tool is **optimized** to avoid token bloat by returning only essential data for top performers instead of the full leaderboard (51 runs). |
|
|
|
|
|
**When to use**: Use this instead of `get_dataset()` when you need to answer questions like: |
|
|
- "Which model is leading?" |
|
|
- "Show me the top 5 models" |
|
|
- "What's the best model for cost?" |
|
|
|
|
|
**Parameters**: |
|
|
- `leaderboard_repo` (str): HuggingFace dataset repository (default: "kshitijthakkar/smoltrace-leaderboard") |
|
|
- `metric` (str): Metric to rank by (default: "success_rate") |
|
|
- Options: "success_rate", "total_cost_usd", "avg_duration_ms", "co2_emissions_g" |
|
|
- `top_n` (int): Number of top models to return (default: 5, range: 1-20) |
|
|
|
|
|
**Returns**: JSON object with top performers - **ready to use, no parsing needed** |
|
|
|
|
|
**Benefits vs get_dataset()**: |
|
|
- ✅ Returns only 5-20 runs instead of all 51 runs (90% token reduction)
- ✅ Properly formatted JSON (no string conversion issues)
- ✅ Pre-sorted by your chosen metric
- ✅ Includes only essential columns (10 fields vs 20+ fields)
|
|
|
|
|
**Example Response**: |
|
|
```json |
|
|
{ |
|
|
"metric_ranked_by": "success_rate", |
|
|
"ranking_order": "descending (higher is better)", |
|
|
"total_runs_in_leaderboard": 51, |
|
|
"top_n": 5, |
|
|
"top_performers": [ |
|
|
{ |
|
|
"run_id": "run_123", |
|
|
"model": "openai/gpt-4", |
|
|
"success_rate": 95.8, |
|
|
"total_cost_usd": 0.05, |
|
|
... |
|
|
} |
|
|
] |
|
|
} |
|
|
``` |
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π 6. get_leaderboard_summary</h3></summary> |
|
|
|
|
|
**Description**: Get high-level leaderboard summary statistics - optimized for overview queries |
|
|
|
|
|
**β‘ Performance**: This tool is **optimized** to return only summary statistics (no individual runs), avoiding the full dataset that causes token bloat. |
|
|
|
|
|
**When to use**: Use this instead of `get_dataset()` when you need to answer questions like: |
|
|
- "How many runs are in the leaderboard?" |
|
|
- "What's the average success rate?" |
|
|
- "Give me an overview of the leaderboard" |
|
|
|
|
|
**Parameters**: |
|
|
- `leaderboard_repo` (str): HuggingFace dataset repository (default: "kshitijthakkar/smoltrace-leaderboard") |
|
|
|
|
|
**Returns**: JSON object with summary statistics - **ready to use, no parsing needed** |
|
|
|
|
|
**Benefits vs get_dataset()**: |
|
|
- ✅ Returns aggregated stats instead of raw data (99% token reduction)
- ✅ Properly formatted JSON (no string conversion issues)
- ✅ Includes breakdowns by agent_type and provider
- ✅ Shows top 3 models by success rate
- ✅ Calculates averages, totals, and distributions
|
|
|
|
|
**Example Response**: |
|
|
```json |
|
|
{ |
|
|
"leaderboard_repo": "kshitijthakkar/smoltrace-leaderboard", |
|
|
"summary": { |
|
|
"total_runs": 51, |
|
|
"unique_models": 15, |
|
|
"overall_stats": { |
|
|
"avg_success_rate": 89.5, |
|
|
"best_success_rate": 95.8, |
|
|
"avg_cost_per_run_usd": 0.023 |
|
|
}, |
|
|
"breakdown_by_agent_type": {...}, |
|
|
"top_3_models_by_success_rate": [...] |
|
|
} |
|
|
} |
|
|
``` |
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π¦ 7. get_dataset</h3></summary> |
|
|
|
|
|
**Description**: Load SMOLTRACE datasets from HuggingFace and return as JSON |
|
|
|
|
|
**β οΈ Note**: For leaderboard queries, prefer using `get_top_performers()` or `get_leaderboard_summary()` instead - they're optimized to avoid token bloat! |
|
|
|
|
|
**Parameters**: |
|
|
- `dataset_repo` (str, required): HuggingFace dataset repository path with "smoltrace-" prefix (e.g., "kshitijthakkar/smoltrace-leaderboard") |
|
|
- `max_rows` (int): Maximum number of rows to return (default: 50, range: 1-200) |
|
|
|
|
|
**Returns**: JSON object with dataset data and metadata |
|
|
|
|
|
**Restriction**: Only datasets with "smoltrace-" in the repository name are allowed for security. |
|
|
|
|
|
**Use Cases**: |
|
|
- Load smoltrace-results-* datasets to see individual test case details |
|
|
- Load smoltrace-traces-* datasets to access OpenTelemetry trace data |
|
|
- Load smoltrace-metrics-* datasets to get GPU metrics and performance data |
|
|
- For leaderboard: Use `get_top_performers()` or `get_leaderboard_summary()` instead! |
|
|
|
|
|
**Workflow**: |
|
|
1. Use `get_leaderboard_summary()` for overview questions |
|
|
2. Use `get_top_performers()` for "top N" queries |
|
|
3. Use `get_dataset()` only for non-leaderboard datasets or when you need specific run IDs |
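**Example** (illustrative call; the tool returns a JSON string, as this UI's handler shows):

```python
import json

# inside an async function
result = await get_dataset(
    dataset_repo="kshitijthakkar/smoltrace-leaderboard",
    max_rows=50
)
data = json.loads(result)
```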
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π§ͺ 8. generate_synthetic_dataset</h3></summary> |
|
|
|
|
|
**Description**: Generate domain-specific synthetic test datasets for SMOLTRACE evaluations using AI |
|
|
|
|
|
**Parameters**: |
|
|
- `domain` (str, required): The domain for synthetic tasks (e.g., "finance", "healthcare", "travel", "ecommerce", "customer_support") |
|
|
- `tool_names` (str, required): Comma-separated list of tool names to include (e.g., "get_weather,search_web,calculator") |
|
|
- `num_tasks` (int): Number of synthetic tasks to generate (default: 10, range: 5-100) |
|
|
- `difficulty_distribution` (str): How to distribute task difficulty (default: "balanced") |
|
|
- Options: "balanced" (40% easy, 40% medium, 20% hard), "easy_only", "medium_only", "hard_only", "progressive" (50% easy, 30% medium, 20% hard) |
|
|
- `agent_type` (str): Target agent type for tasks (default: "both") |
|
|
- Options: "tool" (ToolCallingAgent), "code" (CodeAgent), "both" (50/50 mix) |
|
|
|
|
|
**Returns**: JSON object with dataset_info (including batch statistics), tasks array (SMOLTRACE format), and usage_instructions |
|
|
|
|
|
**π Batched Generation**: |
|
|
- Requests >20 tasks are automatically split into parallel batches |
|
|
- Each batch generates up to 20 tasks concurrently |
|
|
- Example: 100 tasks = 5 parallel batches (20 tasks each) |
|
|
- Timeout: 120 seconds per batch |
|
|
- Token limit: 8,192 per batch (40,960 total for 100 tasks) |
|
|
|
|
|
**Performance**: |
|
|
- 5-20 tasks: Single batch, ~30-60 seconds |
|
|
- 21-100 tasks: Multiple parallel batches, ~60-120 seconds per batch |
|
|
|
|
|
**SMOLTRACE Task Format**: |
|
|
Each task includes: `id`, `prompt`, `expected_tool`, `expected_tool_calls` (optional), `difficulty`, `agent_type`, `expected_keywords` (optional) |
|
|
|
|
|
**Use Cases**: |
|
|
- Create custom evaluation datasets for industry-specific domains |
|
|
- Test agents with proprietary tools and APIs |
|
|
- Generate benchmarks for internal workflows |
|
|
- Rapid prototyping of evaluation scenarios |
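**Example** (illustrative call; the tool returns a JSON string, as this UI's handler shows):

```python
import json

# inside an async function
result = await generate_synthetic_dataset(
    domain="travel",
    tool_names="get_weather,search_flights,book_hotel",
    num_tasks=10,
    difficulty_distribution="balanced",
    agent_type="both"
)
dataset = json.loads(result)
```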
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π€ 9. push_dataset_to_hub</h3></summary> |
|
|
|
|
|
**Description**: Push a generated synthetic dataset to HuggingFace Hub |
|
|
|
|
|
**Parameters**: |
|
|
- `dataset_json` (str, required): JSON string containing the tasks array from generate_synthetic_dataset |
|
|
- `repo_name` (str, required): HuggingFace repository name following SMOLTRACE naming convention |
|
|
- Format: `{username}/smoltrace-{domain}-tasks` or `{username}/smoltrace-{domain}-tasks-v{version}` |
|
|
- Examples: `kshitij/smoltrace-finance-tasks`, `kshitij/smoltrace-healthcare-tasks-v2` |
|
|
- `hf_token` (str, optional): HuggingFace API token with write permissions (uses saved token from Settings if not provided) |
|
|
- `private` (bool): Whether to create a private repository (default: False) |
|
|
|
|
|
**Returns**: JSON object with upload status, repository URL, and dataset information |
|
|
|
|
|
**Validation**: |
|
|
- ✅ Checks SMOLTRACE naming convention (`smoltrace-` prefix required)
- ✅ Validates all tasks have required fields (id, prompt, expected_tool, difficulty, agent_type)
- ✅ Verifies HuggingFace token has write permissions
- ✅ Handles repository creation if it doesn't exist
|
|
|
|
|
**Workflow**: |
|
|
1. Generate synthetic dataset using `generate_synthetic_dataset` |
|
|
2. Extract the `tasks` array from the response JSON |
|
|
3. Convert tasks array to JSON string |
|
|
4. Call `push_dataset_to_hub` with the JSON string and desired repo name |
|
|
5. Share the dataset URL with your team or use in SMOLTRACE evaluations |
|
|
|
|
|
**Example Integration**: |
|
|
```python |
|
|
# Step 1: Generate dataset |
|
|
result = generate_synthetic_dataset( |
|
|
domain="finance", |
|
|
tool_names="get_stock_price,calculate_roi,fetch_company_info", |
|
|
num_tasks=50 |
|
|
) |
|
|
|
|
|
# Step 2: Extract tasks |
|
|
import json |
|
|
data = json.loads(result) |
|
|
tasks_json = json.dumps(data["tasks"]) |
|
|
|
|
|
# Step 3: Push to HuggingFace |
|
|
push_result = push_dataset_to_hub( |
|
|
dataset_json=tasks_json, |
|
|
repo_name="your-username/smoltrace-finance-tasks", |
|
|
hf_token="hf_xxx", |
|
|
private=False |
|
|
) |
|
|
``` |
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π 10. analyze_results</h3></summary> |
|
|
|
|
|
**Description**: Analyzes detailed test results and provides optimization recommendations |
|
|
|
|
|
**Parameters**: |
|
|
- `results_repo` (str, required): HuggingFace dataset containing results |
|
|
- Format: `username/smoltrace-results-model-timestamp` |
|
|
- Must contain "smoltrace-results-" prefix |
|
|
- `analysis_focus` (str): Focus area for analysis (default: "comprehensive") |
|
|
- Options: "failures", "performance", "cost", "comprehensive" |
|
|
- `max_rows` (int): Maximum test cases to analyze (default: 100, range: 10-500) |
|
|
|
|
|
**Returns**: JSON object with AI analysis including: |
|
|
- Overall statistics (success rate, average duration, total cost) |
|
|
- Failure patterns and root causes |
|
|
- Performance bottlenecks in specific test cases |
|
|
- Cost optimization opportunities |
|
|
- Tool usage patterns |
|
|
- Task-specific insights (which types work well vs poorly) |
|
|
- Actionable optimization recommendations |
|
|
|
|
|
**Use Case**: |
|
|
After running an evaluation, analyze the detailed test results to understand why certain tests are failing and get specific recommendations for improving success rate. |
|
|
|
|
|
**Example**: |
|
|
```python
result = analyze_results(
    results_repo="kshitij/smoltrace-results-gpt4-20251120",
    analysis_focus="failures",
    max_rows=100
)
```
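
The response is a single JSON document, so downstream handling is plain JSON parsing. A minimal sketch follows; the key names used here (`success_rate`, `recommendations`) are assumptions for illustration, so inspect one real response to confirm the actual schema.

```python
import json

analysis = json.loads(result)

# Key names below are illustrative assumptions; adjust to the real response schema.
print("Success rate:", analysis.get("success_rate"))
for recommendation in analysis.get("recommendations", []):
    print("-", recommendation)
```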
|
|
|
|
|
</details> |
|
|
|
|
|
<details> |
|
|
<summary><h3>π 11. generate_prompt_template</h3></summary> |
|
|
|
|
|
**Description**: Generate customized smolagents prompt template for a specific domain and tool set |
|
|
|
|
|
**Parameters**: |
|
|
- `domain` (str, required): Domain for the prompt template |
|
|
- Examples: "finance", "healthcare", "customer_support", "e-commerce" |
|
|
- `tool_names` (str, required): Comma-separated list of tool names |
|
|
- Format: "tool1,tool2,tool3" |
|
|
- Example: "get_stock_price,calculate_roi,fetch_company_info" |
|
|
- `agent_type` (str): Agent type (default: "tool") |
|
|
- Options: "tool" (ToolCallingAgent), "code" (CodeAgent) |
|
|
|
|
|
**Returns**: JSON object containing: |
|
|
- Customized YAML prompt template |
|
|
- Metadata (domain, tools, agent_type, timestamp) |
|
|
- Usage instructions |
|
|
|
|
|
**Use Case**: |
|
|
When you generate synthetic datasets with `generate_synthetic_dataset`, use this tool to create a matching prompt template that agents can use during evaluation. This ensures your evaluation setup is complete and ready to run. |
|
|
|
|
|
**Integration**: |
|
|
The generated prompt template can be included in your HuggingFace dataset card, making it easy for anyone to run evaluations with your dataset. |
|
|
|
|
|
**Example**: |
|
|
```python
result = generate_prompt_template(
    domain="customer_support",
    tool_names="search_knowledge_base,create_ticket,send_email,escalate_to_human",
    agent_type="tool"
)
```
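
Because the YAML template is wrapped in a JSON response, a common follow-up is to write it out so it can ship next to your dataset. A minimal sketch, assuming the template is exposed under a key such as `prompt_template` (hypothetical; check the actual response keys):

```python
import json
from pathlib import Path

data = json.loads(result)

# "prompt_template" is an assumed key name for illustration only.
Path("customer_support_prompts.yaml").write_text(data["prompt_template"])
```

The saved file can then be referenced from the dataset card, as suggested in the Integration note above.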
|
|
|
|
|
</details> |
|
|
|
|
|
--- |
|
|
|
|
|
## MCP Integration |
|
|
|
|
|
This Gradio app is MCP-enabled. When deployed to HuggingFace Spaces, it can be accessed via MCP clients. |
|
|
|
|
|
**HuggingFace Space**: `https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server` |
|
|
|
|
|
**π¬ Quick Demo (5 min)**: [Watch on Loom](https://www.loom.com/share/d4d0003f06fa4327b46ba5c081bdf835) |
|
|
|
|
|
**πΊ Full Demo (20 min)**: [Watch on Loom](https://www.loom.com/share/de559bb0aef749559c79117b7f951250) |
|
|
|
|
|
**MCP Endpoint (SSE - Recommended)**: `https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse` |
|
|
|
|
|
**MCP Endpoint (Streamable HTTP)**: `https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/` |
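
Any MCP client library can talk to the SSE endpoint above. The snippet below is a minimal sketch assuming the official `mcp` Python SDK (`pip install mcp`) and its SSE client helpers; it only lists the available tools and shows where a `call_tool` request would go, so adapt it to your client of choice.

```python
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client

SSE_URL = "https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse"

async def main():
    # Open an SSE transport to the Space, then speak MCP over it.
    async with sse_client(SSE_URL) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()

            # Discover the tools (names, descriptions, input schemas).
            tools = await session.list_tools()
            for tool in tools.tools:
                print(f"{tool.name}: {tool.description}")

            # To invoke a tool, pass its name plus an arguments dict matching
            # the input schema reported above, e.g.:
            # await session.call_tool("get_leaderboard_summary", arguments={...})

asyncio.run(main())
```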
|
|
|
|
|
### What's Exposed via MCP: |
|
|
|
|
|
#### 11 MCP Tools (AI-Powered & Optimized) |
|
|
The eleven tools above (`analyze_leaderboard`, `debug_trace`, `estimate_cost`, `compare_runs`, `analyze_results`, `get_top_performers`, `get_leaderboard_summary`, `get_dataset`, `generate_synthetic_dataset`, `generate_prompt_template`, `push_dataset_to_hub`) are automatically exposed as MCP tools and can be called from any MCP client.
|
|
|
|
|
#### 3 MCP Resources (Data Access) |
|
|
- `leaderboard://{repo}` - Raw leaderboard data |
|
|
- `trace://{trace_id}/{repo}` - Raw trace data |
|
|
- `cost://model/{model_name}` - Model pricing data |
|
|
|
|
|
#### 3 MCP Prompts (Templates) |
|
|
- `analysis_prompt(analysis_type, focus_area, detail_level)` - Analysis templates |
|
|
- `debug_prompt(debug_type, context)` - Debug templates |
|
|
- `optimization_prompt(optimization_goal, constraints)` - Optimization templates |
|
|
|
|
|
**See the "π MCP Resources & Prompts" tab to test these features.** |
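
Resources and prompts are fetched over the same MCP session as the tools. A minimal sketch, reusing an initialized `ClientSession` like the one opened in the connection example above; the repository and argument values are placeholders, not real datasets:

```python
async def explore_resources_and_prompts(session):
    # `session` is an initialized mcp.ClientSession (see the connection sketch above).

    # Read a raw resource by URI (placeholder repo name).
    leaderboard = await session.read_resource("leaderboard://your-username/smoltrace-leaderboard")

    # Fetch a prompt template with its declared arguments (placeholder values).
    analysis = await session.get_prompt(
        "analysis_prompt",
        arguments={
            "analysis_type": "leaderboard",
            "focus_area": "cost",
            "detail_level": "detailed",
        },
    )
    return leaderboard, analysis
```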
|
|
""") |
|
|
|
|
|
gr.Markdown(""" |
|
|
--- |
|
|
|
|
|
## Environment Variables |
|
|
|
|
|
Required: |
|
|
- `GEMINI_API_KEY`: Your Google Gemini API key |
|
|
- `HF_TOKEN`: Your HuggingFace token (for dataset access) |
|
|
|
|
|
## Source Code |
|
|
|
|
|
This server is part of the TraceMind project submission for MCP's 1st Birthday Hackathon. |
|
|
|
|
|
**Track 1**: Building MCP (Enterprise) |
|
|
**Tag**: `building-mcp-track-enterprise` |
|
|
""") |
|
|
|
|
|
with gr.Tab("βοΈ Settings"): |
|
|
|
|
|
current_gemini = os.environ.get("GEMINI_API_KEY", "") |
|
|
current_hf = os.environ.get("HF_TOKEN", "") |
|
|
|
|
|
gemini_display = "β
Configured" if current_gemini else "β Not configured" |
|
|
hf_display = "β
Configured" if current_hf else "β Not configured" |
|
|
|
|
|
gr.Markdown(f""" |
|
|
### API Configuration |
|
|
|
|
|
**Current Status**: Gemini API: {gemini_display} β’ HuggingFace Token: {hf_display} |
|
|
|
|
|
The server is pre-configured with API keys from HuggingFace Spaces Secrets. You can optionally override them with your own keys for this session.
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
gemini_api_key_input = gr.Textbox( |
|
|
label="Google Gemini API Key (Optional)", |
|
|
placeholder="AIza...", |
|
|
type="password", |
|
|
value="", |
|
|
info="Free tier: 1,500 requests/day", |
|
|
scale=1 |
|
|
) |
|
|
hf_token_input = gr.Textbox( |
|
|
label="HuggingFace Token (Optional)", |
|
|
placeholder="hf_...", |
|
|
type="password", |
|
|
value="", |
|
|
info="Read or Write permissions", |
|
|
scale=1 |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
save_keys_btn = gr.Button("πΎ Apply Configuration", variant="primary", size="lg") |
|
|
reset_keys_btn = gr.Button("π Reset to Defaults", variant="secondary", size="lg") |
|
|
|
|
|
settings_status = gr.Markdown("") |
|
|
|
|
|
with gr.Accordion("π Setup Instructions", open=False): |
|
|
gr.Markdown(""" |
|
|
**Google Gemini API**: Get your key at [Google AI Studio](https://ai.google.dev/) (Free: 1,500 requests/day) |
|
|
|
|
|
**HuggingFace Token**: Create at [HuggingFace Settings](https://huggingface.co/settings/tokens) (Read or Write permissions) |
|
|
|
|
|
**Security**: Custom keys are session-only and cleared on page refresh. |
|
|
""") |
|
|
|
|
|
|
|
|
def save_override_keys(gemini, hf): |
|
|
"""Save user-provided API keys to session (override Spaces Secrets)""" |
|
|
results = [] |
|
|
|
|
|
if gemini and gemini.strip(): |
|
|
if gemini.startswith("AIza"): |
|
|
os.environ["GEMINI_API_KEY"] = gemini.strip() |
|
|
results.append("β
**Gemini API**: Configuration applied successfully") |
|
|
logger.info("Gemini API key overridden by user") |
|
|
else: |
|
|
results.append("β **Gemini API**: Invalid format (must start with 'AIza')") |
|
|
|
|
|
if hf and hf.strip(): |
|
|
if hf.startswith("hf_"): |
|
|
os.environ["HF_TOKEN"] = hf.strip() |
|
|
results.append("β
**HuggingFace Token**: Configuration applied successfully") |
|
|
logger.info("HuggingFace token overridden by user") |
|
|
else: |
|
|
results.append("β **HuggingFace Token**: Invalid format (must start with 'hf_')") |
|
|
|
|
|
if not results: |
|
|
return "βΉοΈ **No changes**: Empty fields submitted. Default configuration remains active." |
|
|
|
|
|
results.append("\n**Status**: Custom configuration active for this session.") |
|
|
return "\n\n".join(results) |
|
|
|
|
|
def reset_to_defaults(): |
|
|
"""Reset to Spaces Secrets (requires page refresh)""" |
|
|
return """ |
|
|
ℹ️ **Reset Instructions**
|
|
|
|
|
To restore default HuggingFace Spaces configuration: |
|
|
1. Refresh this page (F5 or Ctrl+R) |
|
|
2. Session overrides will be cleared automatically |
|
|
|
|
|
Default credentials will be active after refresh. |
|
|
""" |
|
|
|
|
|
|
|
|
save_keys_btn.click( |
|
|
fn=save_override_keys, |
|
|
inputs=[gemini_api_key_input, hf_token_input], |
|
|
outputs=[settings_status], |
|
|
api_name=False |
|
|
) |
|
|
|
|
|
reset_keys_btn.click( |
|
|
fn=reset_to_defaults, |
|
|
outputs=[settings_status], |
|
|
api_name=False |
|
|
) |
|
|
|
|
|
return demo |
|
|
|
|
|
if __name__ == "__main__": |
|
|
logger.info("=" * 70) |
|
|
logger.info("TraceMind MCP Server - HuggingFace Space (Track 1)") |
|
|
logger.info("=" * 70) |
|
|
logger.info("MCP Server: TraceMind Agent Evaluation Platform v1.0.0") |
|
|
logger.info("Protocol: Model Context Protocol (MCP)") |
|
|
logger.info("Transport: Gradio Native MCP Support (SSE)") |
|
|
logger.info("MCP Endpoint (SSE): https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/sse") |
|
|
logger.info("MCP Endpoint (HTTP): https://mcp-1st-birthday-tracemind-mcp-server.hf.space/gradio_api/mcp/") |
|
|
logger.info("=" * 70) |
|
|
logger.info("Features:") |
|
|
logger.info(" β 7 AI-Powered Tools (Leaderboard + Trace + Cost + Dataset)") |
|
|
logger.info(" β 3 Real-Time Resources (leaderboard, trace, cost data)") |
|
|
logger.info(" β 3 Prompt Templates (analysis, debug, optimization)") |
|
|
logger.info(" β Google Gemini 2.5 Flash - Intelligent Analysis") |
|
|
logger.info(" β HuggingFace Dataset Integration") |
|
|
logger.info(" β SMOLTRACE Format Support") |
|
|
logger.info(" β Synthetic Dataset Generation") |
|
|
logger.info("=" * 70) |
|
|
logger.info("Tool Categories:") |
|
|
logger.info(" π Analysis: analyze_leaderboard, compare_runs") |
|
|
logger.info(" π Debugging: debug_trace") |
|
|
logger.info(" π° Cost: estimate_cost") |
|
|
logger.info(" π¦ Data: get_dataset") |
|
|
logger.info(" π§ͺ Generation: generate_synthetic_dataset, push_dataset_to_hub") |
|
|
logger.info("=" * 70) |
|
|
logger.info("Compatible Clients:") |
|
|
logger.info(" β’ Claude Desktop") |
|
|
logger.info(" β’ Continue.dev (VS Code)") |
|
|
logger.info(" β’ Cline (VS Code)") |
|
|
logger.info(" β’ Any MCP-compatible client") |
|
|
logger.info("=" * 70) |
|
|
logger.info("How to Connect (Claude Desktop/HF MCP Client):") |
|
|
logger.info(" 1. Go to https://huggingface.co/settings/mcp") |
|
|
logger.info(" 2. Add Space: MCP-1st-Birthday/TraceMind-mcp-server") |
|
|
logger.info(" 3. Start using TraceMind tools in your MCP client!") |
|
|
logger.info("=" * 70) |
|
|
logger.info("Starting Gradio UI + MCP Server on 0.0.0.0:7860...") |
|
|
logger.info("Waiting for connections...") |
|
|
logger.info("=" * 70) |
|
|
|
|
|
try: |
|
|
|
|
|
demo = create_gradio_ui() |
|
|
|
|
|
|
|
|
theme = gr.themes.Base( |
|
|
primary_hue="indigo", |
|
|
secondary_hue="purple", |
|
|
neutral_hue="slate", |
|
|
font=gr.themes.GoogleFont("Inter"), |
|
|
).set( |
|
|
body_background_fill="*neutral_50", |
|
|
body_background_fill_dark="*neutral_900", |
|
|
button_primary_background_fill="*primary_500", |
|
|
button_primary_background_fill_hover="*primary_600", |
|
|
button_primary_text_color="white", |
|
|
) |
|
|
|
|
|
|
|
|
# Note: Blocks.launch() does not accept a theme argument; to apply the theme
# defined above, pass it to gr.Blocks(theme=...) inside create_gradio_ui().
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    mcp_server=True
)
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Failed to start server: {e}") |
|
|
logger.error("Check that:") |
|
|
logger.error(" 1. GEMINI_API_KEY environment variable is set") |
|
|
logger.error(" 2. Port 7860 is available") |
|
|
logger.error(" 3. All dependencies are installed") |
|
|
raise |
|
|
|