kshitijthakkar committed
Commit 3001796 · 1 Parent(s): 02e6639

feat: Use real pricing data from genai_otel + Modal compute costs


Major improvements to cost estimation accuracy:

1. LLM Pricing:
- Fetch real pricing from the genai_otel project's pricing database
- 200+ models with accurate input/output token costs
- Fuzzy matching for model name variations
- Graceful fallback for unknown models (lookup flow sketched below)
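
A minimal sketch of that lookup flow, mirroring the diff below (the helper name lookup_pricing is illustrative, not part of the change):

    import requests

    PRICING_URL = ("https://raw.githubusercontent.com/Mandark-droid/"
                   "genai_otel_instrument/refs/heads/main/genai_otel/llm_pricing.json")

    def lookup_pricing(model: str) -> dict:
        """Exact match, then fuzzy match on the bare model name, then default."""
        try:
            db = requests.get(PRICING_URL, timeout=5).json()
        except Exception:
            db = {}  # network failure: fall through to the default rates
        if model in db:
            return db[model]
        name = model.split("/")[-1]  # e.g. "gpt-4" from "openai/gpt-4"
        for key in db:
            if name in key or key in name:
                return db[key]
        # Unknown model: conservative default per-token rates
        return {"input_cost_per_token": 0.000001, "output_cost_per_token": 0.000002}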

2. Modal Compute Costs:
- Updated to actual per-second rates from the Modal website
- Support for all GPU tiers: B200, H200, H100, A100 (40/80 GB), L40S, A10, L4, T4
- Accurate CPU core and memory pricing
- Proper per-second calculation instead of hourly estimates (worked example below)
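
For instance, using the A10 rate and the 8 s per-test local-model estimate from the diff below:

    rate_per_sec = 0.000306        # gpu_a10, USD per second
    total_seconds = 100 * 8.0      # num_tests * duration_per_test
    compute_cost = total_seconds * rate_per_sec
    print(f"${compute_cost:.4f}")  # -> $0.2448 for 100 tests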

3. Enhanced Data:
- Added pricing source metadata
- Include model pricing details in the response
- Higher precision (6 decimal places) for accurate cost tracking
- Better CO2 estimates per GPU type (see the example below)
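
Continuing the example above, the same 100-test A10 run maps to a rough CO2 figure via the per-hour table in the diff:

    duration_seconds = 100 * 8.0
    co2_kg = (duration_seconds / 3600) * 0.15  # gpu_a10: 0.15 kg CO2 per hour
    print(round(co2_kg, 4))                    # -> 0.0333 kg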

This provides accurate real-world cost estimates for any model in the database.

Files changed (1)
mcp_tools.py +91 -33
mcp_tools.py CHANGED
@@ -282,6 +282,20 @@ async def estimate_cost(
     try:
         # Initialize Gemini client with provided key or from environment
         gemini_client = GeminiClient()
+
+        # Fetch LLM pricing from genai_otel project
+        import requests
+        pricing_url = "https://raw.githubusercontent.com/Mandark-droid/genai_otel_instrument/refs/heads/main/genai_otel/llm_pricing.json"
+
+        try:
+            response = requests.get(pricing_url, timeout=5)
+            response.raise_for_status()
+            llm_pricing_db = response.json()
+            print(f"[INFO] Loaded {len(llm_pricing_db)} models from pricing database")
+        except Exception as e:
+            print(f"[WARNING] Failed to load pricing database: {e}, using fallback")
+            llm_pricing_db = {}
+
         # Determine if API or local model
         is_api_model = any(provider in model.lower() for provider in ["openai", "anthropic", "google", "cohere"])

@@ -289,24 +303,46 @@
         if hardware == "auto":
             hardware = "cpu" if is_api_model else "gpu_a10"

-        # Cost data (simplified estimates)
-        llm_costs = {
-            "openai/gpt-4": {"input": 0.03, "output": 0.06},  # per 1K tokens
-            "openai/gpt-3.5-turbo": {"input": 0.0015, "output": 0.002},
-            "anthropic/claude-3-opus": {"input": 0.015, "output": 0.075},
-            "anthropic/claude-3-sonnet": {"input": 0.003, "output": 0.015},
-            "meta-llama/Llama-3.1-8B": {"input": 0, "output": 0},  # Local model
-            "default": {"input": 0.001, "output": 0.002}
+        # Modal compute costs (per second) - from Modal website
+        modal_compute_costs = {
+            # GPU Tasks
+            "gpu_b200": 0.001736,  # Nvidia B200
+            "gpu_h200": 0.001261,  # Nvidia H200
+            "gpu_h100": 0.001097,  # Nvidia H100
+            "gpu_a100_80gb": 0.000694,  # Nvidia A100, 80 GB
+            "gpu_a100": 0.000583,  # Nvidia A100, 40 GB
+            "gpu_l40s": 0.000542,  # Nvidia L40S
+            "gpu_a10": 0.000306,  # Nvidia A10
+            "gpu_l4": 0.000222,  # Nvidia L4
+            "gpu_t4": 0.000164,  # Nvidia T4
+            # CPU (per core)
+            "cpu": 0.0000131,  # Physical core (2 vCPU equivalent)
+            # Memory (per GiB)
+            "memory": 0.00000222  # Per GiB
         }

-        hf_jobs_costs = {
-            "cpu": 0.60,  # per hour
-            "gpu_a10": 1.10,  # per hour
-            "gpu_h200": 4.50  # per hour
-        }
-
-        # Get model costs
-        model_cost = llm_costs.get(model, llm_costs["default"])
+        # Get model costs from pricing database
+        model_cost = None
+
+        # Try exact match first
+        if model in llm_pricing_db:
+            model_cost = llm_pricing_db[model]
+        else:
+            # Try without provider prefix (e.g., "gpt-4" instead of "openai/gpt-4")
+            model_name = model.split('/')[-1]
+            for key in llm_pricing_db:
+                if model_name in key or key in model_name:
+                    model_cost = llm_pricing_db[key]
+                    print(f"[INFO] Found pricing for {model} via fuzzy match: {key}")
+                    break
+
+        # Fallback to default if not found
+        if model_cost is None:
+            print(f"[WARNING] Model {model} not in pricing database, using default")
+            if is_api_model:
+                model_cost = {"input_cost_per_token": 0.000001, "output_cost_per_token": 0.000002}
+            else:
+                model_cost = {"input_cost_per_token": 0, "output_cost_per_token": 0}  # Local model

         # Estimate token usage per test
         # Tool agent: ~200 tokens input, ~150 output
@@ -320,10 +356,10 @@

         tokens_per_test = token_estimates[agent_type]

-        # Calculate LLM costs
+        # Calculate LLM costs (pricing is per token, not per 1K tokens)
         llm_cost_per_test = (
-            (tokens_per_test["input"] / 1000) * model_cost["input"] +
-            (tokens_per_test["output"] / 1000) * model_cost["output"]
+            tokens_per_test["input"] * model_cost.get("input_cost_per_token", 0) +
+            tokens_per_test["output"] * model_cost.get("output_cost_per_token", 0)
         )
         total_llm_cost = llm_cost_per_test * num_tests

@@ -333,20 +369,34 @@
         else:
             duration_per_test = 8.0  # Local models slower but depends on GPU

-        total_duration_hours = (duration_per_test * num_tests) / 3600
+        total_duration_seconds = duration_per_test * num_tests

-        # Calculate HF Jobs costs
-        jobs_hourly_rate = hf_jobs_costs.get(hardware, hf_jobs_costs["cpu"])
-        total_jobs_cost = total_duration_hours * jobs_hourly_rate
+        # Calculate Modal compute costs (per second)
+        compute_rate_per_sec = modal_compute_costs.get(hardware, modal_compute_costs["cpu"])

-        # Estimate CO2 (rough estimates)
+        # For CPU, estimate core usage (assume 2 cores for agent workload)
+        # For GPU, direct cost
+        if hardware == "cpu":
+            num_cores = 2  # Estimate 2 cores for typical agent workload
+            total_compute_cost = total_duration_seconds * compute_rate_per_sec * num_cores
+        else:
+            total_compute_cost = total_duration_seconds * compute_rate_per_sec
+
+        # Estimate CO2 (rough estimates in kg per hour)
         co2_per_hour = {
-            "cpu": 0.05,  # kg CO2
+            "cpu": 0.05,
+            "gpu_t4": 0.10,
+            "gpu_l4": 0.12,
             "gpu_a10": 0.15,
-            "gpu_h200": 0.30
+            "gpu_l40s": 0.20,
+            "gpu_a100": 0.25,
+            "gpu_a100_80gb": 0.28,
+            "gpu_h100": 0.30,
+            "gpu_h200": 0.32,
+            "gpu_b200": 0.35
         }

-        total_co2_kg = total_duration_hours * co2_per_hour.get(hardware, 0.05)
+        total_co2_kg = (total_duration_seconds / 3600) * co2_per_hour.get(hardware, 0.05)

         # Prepare estimate data
         estimate_data = {
@@ -355,15 +405,23 @@
             "num_tests": num_tests,
             "hardware": hardware,
             "is_api_model": is_api_model,
+            "pricing_source": "genai_otel pricing database + Modal compute costs",
             "estimates": {
-                "llm_cost_usd": round(total_llm_cost, 4),
-                "llm_cost_per_test": round(llm_cost_per_test, 4),
-                "jobs_cost_usd": round(total_jobs_cost, 4),
-                "total_cost_usd": round(total_llm_cost + total_jobs_cost, 4),
-                "duration_hours": round(total_duration_hours, 2),
+                "llm_cost_usd": round(total_llm_cost, 6),
+                "llm_cost_per_test": round(llm_cost_per_test, 6),
+                "compute_cost_usd": round(total_compute_cost, 6),
+                "total_cost_usd": round(total_llm_cost + total_compute_cost, 6),
+                "duration_seconds": round(total_duration_seconds, 2),
+                "duration_minutes": round(total_duration_seconds / 60, 2),
                 "duration_per_test_seconds": round(duration_per_test, 2),
-                "co2_emissions_kg": round(total_co2_kg, 3),
-                "tokens_per_test": tokens_per_test
+                "co2_emissions_kg": round(total_co2_kg, 4),
+                "tokens_per_test": tokens_per_test,
+                "compute_rate_per_second": compute_rate_per_sec
+            },
+            "model_pricing": {
+                "input_cost_per_token": model_cost.get("input_cost_per_token", 0),
+                "output_cost_per_token": model_cost.get("output_cost_per_token", 0),
+                "found_in_database": model in llm_pricing_db
             }
         }
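
For orientation, a hypothetical call against the updated tool. The exact signature and return type of estimate_cost are not shown in this diff; parameter names are inferred from the hunks, "tool" as an agent_type value is an assumption, and the snippet assumes estimate_data is returned as-is:

    import asyncio
    from mcp_tools import estimate_cost

    result = asyncio.run(estimate_cost(
        model="openai/gpt-4",  # looked up in the genai_otel pricing database
        hardware="auto",       # resolves to "cpu" for API models
        num_tests=100,
        agent_type="tool",     # assumed key into token_estimates
    ))
    print(result["estimates"]["total_cost_usd"])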