feat: Use real pricing data from genai_otel + Modal compute costs
Major improvements to cost estimation accuracy:
1. LLM Pricing:
- Fetch real pricing from genai_otel project's pricing database
- 200+ models with accurate input/output token costs
- Fuzzy matching for model name variations
- Graceful fallback for unknown models
2. Modal Compute Costs:
- Updated to actual per-second rates from the Modal website
- Support for all GPU tiers: B200, H200, H100, A100 (40/80GB), L40S, A10, L4, T4
- Accurate CPU core and memory pricing
- Proper per-second calculation instead of hourly estimates
3. Enhanced Data:
- Added pricing source metadata
- Include model pricing details in response
- Higher precision (6 decimal places) for accurate cost tracking
- Better CO2 estimates per GPU type
This provides accurate real-world cost estimates for any model in the database.
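As a quick sanity check of the new rates (the token prices here are illustrative, not taken from the pricing database; the A10 rate, 8 s/test duration, and ~200/150 token estimates are from the diff below):

    # Hypothetical worked example: 100 tool-agent tests on gpu_a10.
    # Token prices ($0.15/M input, $0.60/M output) are illustrative only.
    llm_per_test = 200 * 0.15e-6 + 150 * 0.60e-6    # $0.00012 per test
    compute_cost = 100 * 8.0 * 0.000306             # $0.2448 for all 100 tests
    total_cost = 100 * llm_per_test + compute_cost  # ~= $0.2568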
mcp_tools.py (+91, -33)

@@ -282,6 +282,20 @@ async def estimate_cost(
     try:
         # Initialize Gemini client with provided key or from environment
         gemini_client = GeminiClient()
+
+        # Fetch LLM pricing from genai_otel project
+        import requests
+        pricing_url = "https://raw.githubusercontent.com/Mandark-droid/genai_otel_instrument/refs/heads/main/genai_otel/llm_pricing.json"
+
+        try:
+            response = requests.get(pricing_url, timeout=5)
+            response.raise_for_status()
+            llm_pricing_db = response.json()
+            print(f"[INFO] Loaded {len(llm_pricing_db)} models from pricing database")
+        except Exception as e:
+            print(f"[WARNING] Failed to load pricing database: {e}, using fallback")
+            llm_pricing_db = {}
+
         # Determine if API or local model
         is_api_model = any(provider in model.lower() for provider in ["openai", "anthropic", "google", "cohere"])

@@ -289,24 +303,46 @@ async def estimate_cost(
         if hardware == "auto":
             hardware = "cpu" if is_api_model else "gpu_a10"

-            "gpu_a10": 1.10,  # per hour
-            "gpu_h200": 4.50  # per hour
-        }
+        # Modal compute costs (per second) - from Modal website
+        modal_compute_costs = {
+            # GPU tasks
+            "gpu_b200": 0.001736,       # Nvidia B200
+            "gpu_h200": 0.001261,       # Nvidia H200
+            "gpu_h100": 0.001097,       # Nvidia H100
+            "gpu_a100_80gb": 0.000694,  # Nvidia A100, 80 GB
+            "gpu_a100": 0.000583,       # Nvidia A100, 40 GB
+            "gpu_l40s": 0.000542,       # Nvidia L40S
+            "gpu_a10": 0.000306,        # Nvidia A10
+            "gpu_l4": 0.000222,         # Nvidia L4
+            "gpu_t4": 0.000164,         # Nvidia T4
+            # CPU (per core)
+            "cpu": 0.0000131,           # Physical core (2 vCPU equivalent)
+            # Memory (per GiB)
+            "memory": 0.00000222        # Per GiB
+        }
+
+        # Get model costs from pricing database
+        model_cost = None
+
+        # Try exact match first
+        if model in llm_pricing_db:
+            model_cost = llm_pricing_db[model]
+        else:
+            # Try without provider prefix (e.g., "gpt-4" instead of "openai/gpt-4")
+            model_name = model.split('/')[-1]
+            for key in llm_pricing_db:
+                if model_name in key or key in model_name:
+                    model_cost = llm_pricing_db[key]
+                    print(f"[INFO] Found pricing for {model} via fuzzy match: {key}")
+                    break
+
+        # Fallback to default if not found
+        if model_cost is None:
+            print(f"[WARNING] Model {model} not in pricing database, using default")
+            if is_api_model:
+                model_cost = {"input_cost_per_token": 0.000001, "output_cost_per_token": 0.000002}
+            else:
+                model_cost = {"input_cost_per_token": 0, "output_cost_per_token": 0}  # Local model

         # Estimate token usage per test
         # Tool agent: ~200 tokens input, ~150 output

@@ -320,10 +356,10 @@ async def estimate_cost(

         tokens_per_test = token_estimates[agent_type]

-        # Calculate LLM costs
+        # Calculate LLM costs (pricing is per token, not per 1K tokens)
         llm_cost_per_test = (
+            tokens_per_test["input"] * model_cost.get("input_cost_per_token", 0) +
+            tokens_per_test["output"] * model_cost.get("output_cost_per_token", 0)
         )
         total_llm_cost = llm_cost_per_test * num_tests

@@ -333,20 +369,34 @@ async def estimate_cost(
         else:
             duration_per_test = 8.0  # Local models slower but depends on GPU

-        total_jobs_cost = total_duration_hours * jobs_hourly_rate
+        total_duration_seconds = duration_per_test * num_tests
+
+        # Calculate Modal compute costs (per second)
+        compute_rate_per_sec = modal_compute_costs.get(hardware, modal_compute_costs["cpu"])
+
+        # For CPU, estimate core usage (assume 2 cores for agent workload)
+        # For GPU, direct cost
+        if hardware == "cpu":
+            num_cores = 2  # Estimate 2 cores for typical agent workload
+            total_compute_cost = total_duration_seconds * compute_rate_per_sec * num_cores
+        else:
+            total_compute_cost = total_duration_seconds * compute_rate_per_sec
+
+        # Estimate CO2 (rough estimates in kg per hour)
         co2_per_hour = {
             "cpu": 0.05,
+            "gpu_t4": 0.10,
+            "gpu_l4": 0.12,
             "gpu_a10": 0.15,
+            "gpu_l40s": 0.20,
+            "gpu_a100": 0.25,
+            "gpu_a100_80gb": 0.28,
+            "gpu_h100": 0.30,
+            "gpu_h200": 0.32,
+            "gpu_b200": 0.35
         }

+        total_co2_kg = (total_duration_seconds / 3600) * co2_per_hour.get(hardware, 0.05)

         # Prepare estimate data
         estimate_data = {

@@ -355,15 +405,23 @@ async def estimate_cost(
             "num_tests": num_tests,
             "hardware": hardware,
             "is_api_model": is_api_model,
+            "pricing_source": "genai_otel pricing database + Modal compute costs",
             "estimates": {
+                "llm_cost_usd": round(total_llm_cost, 6),
+                "llm_cost_per_test": round(llm_cost_per_test, 6),
+                "compute_cost_usd": round(total_compute_cost, 6),
+                "total_cost_usd": round(total_llm_cost + total_compute_cost, 6),
+                "duration_seconds": round(total_duration_seconds, 2),
+                "duration_minutes": round(total_duration_seconds / 60, 2),
                 "duration_per_test_seconds": round(duration_per_test, 2),
+                "co2_emissions_kg": round(total_co2_kg, 4),
+                "tokens_per_test": tokens_per_test,
+                "compute_rate_per_second": compute_rate_per_sec
+            },
+            "model_pricing": {
+                "input_cost_per_token": model_cost.get("input_cost_per_token", 0),
+                "output_cost_per_token": model_cost.get("output_cost_per_token", 0),
+                "found_in_database": model in llm_pricing_db
             }
         }
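For reference, a minimal standalone sketch of the same lookup flow (same pricing URL as above; `lookup_model_cost` is a hypothetical helper name, not from mcp_tools.py):

    import requests

    PRICING_URL = (
        "https://raw.githubusercontent.com/Mandark-droid/genai_otel_instrument/"
        "refs/heads/main/genai_otel/llm_pricing.json"
    )

    def lookup_model_cost(model: str, pricing_db: dict) -> dict:
        # Exact match first, then substring fuzzy match on the bare model name
        if model in pricing_db:
            return pricing_db[model]
        model_name = model.split("/")[-1]
        for key, cost in pricing_db.items():
            if model_name in key or key in model_name:
                return cost
        # Unknown model: fall back to the same default per-token rates as the diff
        return {"input_cost_per_token": 0.000001, "output_cost_per_token": 0.000002}

    pricing_db = requests.get(PRICING_URL, timeout=5).json()
    print(lookup_model_cost("openai/gpt-4", pricing_db))

Note the substring match is deliberately permissive: "gpt-4" also matches keys like "gpt-4-turbo", so the first hit in the database's iteration order wins.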