🧠 TraceMind Agent Evaluation Leaderboard

Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}

🏆 Top Performers

""" # Top models medals = ["🥇", "🥈", "🥉", "4️⃣", "5️⃣"] for idx, (_, row) in enumerate(top_models.iterrows()): if idx >= top_n: break model_name = row['model'].split('/')[-1] if '/' in str(row['model']) else str(row['model']) html += f"""

{medals[idx]}

{model_name}

""" # Add metrics if 'success_rate' in row and pd.notna(row['success_rate']): html += f'✓ {row["success_rate"]:.1f}% Success Rate' if 'avg_duration_ms' in row and pd.notna(row['avg_duration_ms']): duration_s = row['avg_duration_ms'] / 1000 html += f'⚡ {duration_s:.1f}s Avg Duration' if 'total_cost_usd' in row and pd.notna(row['total_cost_usd']): html += f'💰 ${row["total_cost_usd"]:.4f} per run' # Add GPU metrics if available if 'co2_emissions_g' in row and pd.notna(row['co2_emissions_g']): html += f'🌱 {row["co2_emissions_g"]:.2f}g CO2' if 'gpu_utilization_avg' in row and pd.notna(row['gpu_utilization_avg']): html += f'🎮 {row["gpu_utilization_avg"]:.1f}% GPU Util' html += """

""" # Aggregate stats total_runs = len(df) unique_models = df['model'].nunique() if 'model' in df.columns else 0 avg_success = df['success_rate'].mean() if 'success_rate' in df.columns else 0 html += f"""

📊 Leaderboard Stats

• {total_runs} total evaluation runs
• {unique_models} unique models tested
• {avg_success:.1f}% average success rate
• ${total_cost:.2f} total evaluation cost
• {total_co2:.2f}g total CO2 emissions

🔗 tracemind @ HuggingFace

Built with TraceMind • Powered by SmolTrace & TraceVerde

{f'

' if logo_base64 else ''}

🤖 {model_display} Evaluation Report

Run ID: {run_id}

{timestamp}

""" # Success rate visualization success_rate = run_data.get('success_rate', 0) stars = "⭐" * int(success_rate / 20) # 5 stars max html += f"""

{stars}

{success_rate:.1f}% Success Rate

""" # Performance metrics html += """

📊 Performance Metrics

Tests: {run_data["successful_tests"]}/{run_data["total_tests"]} passed
Avg Steps: {run_data["avg_steps"]:.1f} per test
Avg Duration: {duration_s:.1f}s
Total Duration: {mins}m {secs}s

""" # Cost analysis if 'total_tokens' in run_data or 'total_cost_usd' in run_data: html += """

💰 Cost Analysis

Total Tokens: {run_data["total_tokens"]:,}
Total Cost: ${run_data["total_cost_usd"]:.4f}
Cost per Test: ${run_data["avg_cost_per_test_usd"]:.6f}

""" # Sustainability if 'co2_emissions_g' in run_data or 'provider' in run_data: html += """

🌱 Sustainability

CO2 Emissions: {run_data["co2_emissions_g"]:.2f}g
Provider: {run_data["provider"]} ({provider_label})
GPU Utilization: {run_data["gpu_utilization_avg"]:.1f}%

""" # Footer html += f"""

🔗 View detailed traces at tracemind.huggingface.co

{f'

' if logo_base64 else ''}

⚖️ Model Comparison Report

{model_a} vs {model_b}

{datetime.now().strftime('%Y-%m-%d %H:%M')}

{'🏆' * 5}

Overall Winner: Run {overall_winner} ({a_wins if overall_winner == "A" else b_wins}/4 categories)

Run A: {model_a}

{'✅' if success_winner == "A" else '📊'} Success: {run_a_data.get('success_rate', 0):.1f}%

{'✅' if cost_winner == "A" else '💰'} Cost: ${run_a_data.get('total_cost_usd', 0):.4f}

{'✅' if speed_winner == "A" else '⚡'} Speed: {run_a_data.get('avg_duration_ms', 0)/1000:.2f}s

{'✅' if eco_winner == "A" else '🌱'} CO2: {run_a_data.get('co2_emissions_g', 0):.2f}g

Run B: {model_b}

{'✅' if success_winner == "B" else '📊'} Success: {run_b_data.get('success_rate', 0):.1f}%

{'✅' if cost_winner == "B" else '💰'} Cost: ${run_b_data.get('total_cost_usd', 0):.4f}

{'✅' if speed_winner == "B" else '⚡'} Speed: {run_b_data.get('avg_duration_ms', 0)/1000:.2f}s

{'✅' if eco_winner == "B" else '🌱'} CO2: {run_b_data.get('co2_emissions_g', 0):.2f}g

💡 Recommendation

{f"Run {overall_winner} ({model_a if overall_winner == 'A' else model_b}) is recommended for most use cases" if overall_winner != "Tie" else "Both runs are evenly matched - choose based on your specific priorities"}

🔗 View detailed comparison at tracemind.huggingface.co