kshitijthakkar committed
Commit 83ebb04 · Parent: 8c679b3

feat: Complete Modal integration and fix cost estimation


- Fixed Modal GPU job execution with required packages (hf_transfer, nvidia-ml-py)
- Updated to latest non-deprecated CUDA image (12.6.0-cudnn-devel)
- Made Python version dynamic to match environment (HF Space uses 3.10)
- Added streaming output for real-time progress visibility in Modal logs
- Improved logging with GPU info and download progress indicators
- Fixed cost estimation to show actual hardware for both Modal and HF Jobs (sketched below)
- Auto-selection now displays: 'auto → **A100-80GB** (Modal)' or 'auto → **a10g-large** (HF Jobs)'
- Cost estimates now match actual job hardware selection
- Updated job submission instructions with realistic duration estimates
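
The cost-estimation fixes above boil down to a two-tier fallback: use historical leaderboard data when it exists, otherwise ask the MCP server for an AI-generated estimate. A condensed sketch of that flow, where `load_historical_runs` and `mcp_estimate` are hypothetical stand-ins for the real lookups (the actual implementation is `estimate_job_cost_with_mcp_fallback` in the app.py diff below):

```python
from statistics import mean

def load_historical_runs(model: str) -> list[dict]:
    """Hypothetical stand-in for the leaderboard lookup."""
    return []  # pretend there is no history, forcing the MCP fallback

def mcp_estimate(model: str, hardware: str) -> float:
    """Hypothetical stand-in for the MCP/Gemini cost estimator."""
    return 0.42

def estimate_cost_sketch(model: str, hardware: str) -> dict:
    runs = load_historical_runs(model)
    if runs:
        # Tier 1: average real costs from past leaderboard runs
        return {"source": "historical", "cost": mean(r["cost"] for r in runs)}
    # Tier 2: no history, so fall back to the AI-powered MCP estimator
    return {"source": "mcp", "cost": mcp_estimate(model, hardware)}

print(estimate_cost_sketch("openai/gpt-4.1-nano", "auto"))
# -> {'source': 'mcp', 'cost': 0.42}
```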

Files changed (3)
  1. app.py +77 -14
  2. requirements.txt +3 -0
  3. utils/modal_job_submission.py +176 -27
app.py CHANGED
@@ -2270,10 +2270,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
 
     with gr.Row():
         eval_model = gr.Textbox(
-            value="openai/gpt-4",
+            value="openai/gpt-4.1-nano",
             label="Model",
-            info="Model ID (e.g., openai/gpt-4, meta-llama/Llama-3.1-8B-Instruct)",
-            placeholder="openai/gpt-4"
+            info="Model ID (e.g., openai/gpt-4.1-nano, meta-llama/Llama-3.1-8B-Instruct)",
+            placeholder="openai/gpt-4.1-nano"
         )
 
     eval_provider = gr.Dropdown(
@@ -2462,11 +2462,47 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
     # Evaluation Helper Functions
     # ============================================================================
 
-    def estimate_job_cost_with_mcp_fallback(model, hardware):
+    def estimate_job_cost_with_mcp_fallback(model, hardware, provider="litellm", infrastructure="HuggingFace Jobs"):
         """
         Estimate cost using historical leaderboard data first,
         then fall back to MCP server if model not found
+
+        Args:
+            model: Model name
+            hardware: Hardware selection from UI
+            provider: Provider type (litellm, transformers, etc.)
+            infrastructure: Infrastructure provider (Modal, HuggingFace Jobs)
         """
+        # Handle auto-selection for both infrastructure providers
+        selected_hardware_display = None
+
+        if hardware == "auto":
+            if infrastructure == "Modal":
+                # Modal auto-selection
+                from utils.modal_job_submission import _auto_select_modal_hardware
+                modal_gpu = _auto_select_modal_hardware(provider, model)
+                selected_hardware_display = f"auto → **{modal_gpu or 'CPU'}** (Modal)"
+
+                # Map Modal GPU names to HF Jobs equivalent for cost estimation
+                modal_to_hf_map = {
+                    None: "cpu-basic",  # CPU
+                    "T4": "t4-small",
+                    "L4": "l4x1",
+                    "A10G": "a10g-small",
+                    "L40S": "a10g-large",
+                    "A100": "a100-large",
+                    "A100-80GB": "a100-large",  # Use a100-large as proxy for cost
+                    "H100": "a100-large",  # Use a100 as proxy
+                    "H200": "a100-large",  # Use a100 as proxy
+                }
+                hardware = modal_to_hf_map.get(modal_gpu, "a10g-small")
+            else:
+                # HuggingFace Jobs auto-selection
+                from utils.hf_jobs_submission import _auto_select_hf_hardware
+                hf_hardware = _auto_select_hf_hardware(provider, model)
+                selected_hardware_display = f"auto → **{hf_hardware}** (HF Jobs)"
+                hardware = hf_hardware
+
         try:
             # Try to get historical data from leaderboard
             df = data_loader.load_leaderboard()
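
The hunk above maps Modal GPU names onto HF Jobs hardware tiers so a single cost table can serve both platforms. A standalone rendition of that lookup for readers who want to try it: the `MODAL_TO_HF` dict mirrors the diff, while `resolve_auto_hardware` is an illustrative wrapper, not a function in the codebase.

```python
from typing import Optional

# Mirrors modal_to_hf_map from the hunk above (HF Jobs tiers used as cost proxies).
MODAL_TO_HF = {
    None: "cpu-basic", "T4": "t4-small", "L4": "l4x1", "A10G": "a10g-small",
    "L40S": "a10g-large", "A100": "a100-large", "A100-80GB": "a100-large",
    "H100": "a100-large", "H200": "a100-large",
}

def resolve_auto_hardware(modal_gpu: Optional[str]) -> tuple[str, str]:
    """Illustrative wrapper: returns (HF cost-proxy tier, UI display string)."""
    display = f"auto → **{modal_gpu or 'CPU'}** (Modal)"
    return MODAL_TO_HF.get(modal_gpu, "a10g-small"), display

print(resolve_auto_hardware("A100-80GB"))
# -> ('a100-large', 'auto → **A100-80GB** (Modal)')
```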
@@ -2480,13 +2516,16 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
                 avg_duration = model_runs['avg_duration_ms'].mean()
                 has_cost_data = model_runs['total_cost_usd'].sum() > 0
 
-                return {
+                result = {
                     'source': 'historical',
                     'total_cost_usd': f"{avg_cost:.4f}",
                     'estimated_duration_minutes': f"{(avg_duration / 1000 / 60):.1f}",
                     'historical_runs': len(model_runs),
                     'has_cost_data': has_cost_data
                 }
+                if selected_hardware_display:
+                    result['hardware_display'] = selected_hardware_display
+                return result
             else:
                 # No historical data - use MCP tool
                 print(f"[INFO] No historical data for {model}, using MCP cost estimator")
@@ -2517,7 +2556,7 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
                     extracted_duration = duration_match.group(0) if duration_match else 'See details below'
 
                     # Return with markdown content
-                    return {
+                    result_dict = {
                         'source': 'mcp',
                         'total_cost_usd': extracted_cost,
                         'estimated_duration_minutes': extracted_duration,
@@ -2525,9 +2564,12 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
                         'has_cost_data': True,
                         'markdown_details': result  # Include full markdown response
                     }
+                    if selected_hardware_display:
+                        result_dict['hardware_display'] = selected_hardware_display
+                    return result_dict
                 else:
                     # Unexpected response type
-                    return {
+                    result_dict = {
                         'source': 'mcp',
                         'total_cost_usd': 'N/A',
                         'estimated_duration_minutes': 'N/A',
@@ -2535,12 +2577,15 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
                         'has_cost_data': False,
                         'error': f'MCP returned unexpected type: {type(result)}'
                     }
+                    if selected_hardware_display:
+                        result_dict['hardware_display'] = selected_hardware_display
+                    return result_dict
             except Exception as mcp_error:
                 print(f"[ERROR] MCP cost estimation failed: {mcp_error}")
                 import traceback
                 traceback.print_exc()
                 # Return a result indicating MCP is unavailable
-                return {
+                result_dict = {
                     'source': 'mcp',
                     'total_cost_usd': 'N/A',
                     'estimated_duration_minutes': 'N/A',
@@ -2548,14 +2593,17 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
                     'has_cost_data': False,
                     'error': str(mcp_error)
                 }
+                if selected_hardware_display:
+                    result_dict['hardware_display'] = selected_hardware_display
+                return result_dict
 
         except Exception as e:
             print(f"[ERROR] Cost estimation failed (leaderboard load): {e}")
             return None
 
-    def on_hardware_change(model, hardware):
+    def on_hardware_change(model, hardware, provider, infrastructure):
         """Update cost estimate when hardware selection changes"""
-        cost_est = estimate_job_cost_with_mcp_fallback(model, hardware)
+        cost_est = estimate_job_cost_with_mcp_fallback(model, hardware, provider, infrastructure)
 
         if cost_est is None:
             # Error occurred
@@ -2583,6 +2631,9 @@ No historical data available for **{model}**.
         cost_display = f"${cost_est['total_cost_usd']}" if cost_est['has_cost_data'] else "N/A (cost tracking not enabled)"
         duration = cost_est['estimated_duration_minutes']
 
+        # Use custom hardware display if available, otherwise show hardware as-is
+        hardware_display = cost_est.get('hardware_display', hardware.upper())
+
         return f"""## 💰 Cost Estimate
 
 **{source_label}**
@@ -2590,7 +2641,7 @@ No historical data available for **{model}**.
 | Metric | Value |
 |--------|-------|
 | **Model** | {model} |
-| **Hardware** | {hardware.upper()} |
+| **Hardware** | {hardware_display} |
 | **Estimated Cost** | {cost_display} |
 | **Duration** | {duration} minutes |
 
@@ -2602,13 +2653,18 @@ No historical data available for **{model}**.
         # MCP Cost Estimator - return the full markdown from MCP
         markdown_details = cost_est.get('markdown_details', '')
 
+        # Add hardware selection note if applicable
+        hardware_note = ""
+        if cost_est.get('hardware_display'):
+            hardware_note = f"\n\n**Hardware**: {cost_est['hardware_display']}\n\n"
+
         # Add header to identify the source
         header = f"""## 💰 Cost Estimate - AI Analysis
 
 **🤖 Powered by MCP Server + Gemini 2.5 Pro**
 
 *This estimate was generated by AI analysis since no historical data is available for this model.*
-
+{hardware_note}
 ---
 
 """
@@ -2697,13 +2753,14 @@ No historical data available for **{model}**.
             # Success - build success message
             job_id = result.get('job_id', 'unknown')
             hf_job_id = result.get('hf_job_id', job_id)  # Get actual HF job ID
+            modal_call_id = result.get('modal_call_id', None)  # Get Modal call ID if available
             job_platform = result.get('platform', infra_provider)
             job_hardware = result.get('hardware', hardware)
             job_status = result.get('status', 'submitted')
             job_message = result.get('message', '')
 
             # Estimate cost
-            cost_est = estimate_job_cost_with_mcp_fallback(model, hardware)
+            cost_est = estimate_job_cost_with_mcp_fallback(model, hardware, provider, infra_provider)
             has_cost_estimate = cost_est is not None
 
             cost_info_html = ""
@@ -2770,9 +2827,15 @@ No historical data available for **{model}**.
     <div style="background: rgba(255,255,255,0.15); padding: 15px; border-radius: 5px; margin: 15px 0;">
         <div style="font-size: 0.9em; opacity: 0.9; margin-bottom: 5px;">Run ID (SMOLTRACE)</div>
         <div style="font-family: monospace; font-size: 0.95em; font-weight: bold;">{job_id}</div>
+        {f'''
+        <div style="font-size: 0.9em; opacity: 0.9; margin-top: 10px; margin-bottom: 5px;">Modal Call ID</div>
+        <div style="font-family: monospace; font-size: 0.95em; font-weight: bold;">{modal_call_id}</div>
+        <div style="font-size: 0.8em; opacity: 0.8; margin-top: 8px;">View on Modal Dashboard: <a href="https://modal.com/apps" target="_blank" style="color: rgba(255,255,255,0.9);">https://modal.com/apps</a></div>
+        ''' if modal_call_id else f'''
         <div style="font-size: 0.9em; opacity: 0.9; margin-top: 10px; margin-bottom: 5px;">HF Job ID</div>
         <div style="font-family: monospace; font-size: 0.95em; font-weight: bold;">{hf_job_id}</div>
         <div style="font-size: 0.8em; opacity: 0.8; margin-top: 8px;">Use this ID to monitor: <code style="background: rgba(0,0,0,0.2); padding: 2px 6px; border-radius: 3px;">hf jobs inspect {hf_job_id}</code></div>
+        '''}
     </div>
 
     <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 10px; margin-top: 15px;">
@@ -3646,7 +3709,7 @@ Result: {result}
 
     eval_estimate_btn.click(
         fn=on_hardware_change,
-        inputs=[eval_model, eval_hardware],
+        inputs=[eval_model, eval_hardware, eval_provider, eval_infrastructure],
         outputs=[eval_cost_estimate]
     )
 
 
requirements.txt CHANGED
@@ -35,3 +35,6 @@ smolagents>=1.22.0
 smolagents[mcp]>=1.22.0  # MCP client support
 google-generativeai>=0.3.0  # For Gemini integration
 litellm>=1.0.0  # For LiteLLM model support
+
+# Modal (for serverless GPU compute)
+modal>=0.64.0
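
A quick post-install sanity check (a sketch; it assumes MODAL_TOKEN_ID and MODAL_TOKEN_SECRET are configured the way the job-submission code expects):

```python
import os
from importlib.metadata import version

# Confirm the pinned SDK is importable and the auth env vars are present.
print("modal version:", version("modal"))  # expect >= 0.64.0
for var in ("MODAL_TOKEN_ID", "MODAL_TOKEN_SECRET"):
    print(f"{var} set:", bool(os.environ.get(var)))
```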
utils/modal_job_submission.py CHANGED
@@ -5,6 +5,7 @@ Handles submission of SMOLTRACE evaluation jobs to Modal's serverless compute pl
 """
 
 import os
+import sys
 import uuid
 from typing import Dict, Optional, List
 
@@ -156,13 +157,41 @@ def submit_modal_job(
     try:
         app = modal.App(f"smoltrace-eval-{job_id}")
 
-        # Define Modal function
-        image = modal.Image.debian_slim().pip_install([
-            "smoltrace[otel,gpu]",
-            "litellm",
-            "transformers",
-            "torch"
-        ])
+        # Detect current Python version dynamically (must match for serialized=True)
+        python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+
+        # Define Modal function with appropriate base image
+        # Note: Must match local Python version when using serialized=True
+        if modal_gpu:
+            # Use GPU-optimized image with CUDA for GPU jobs (using latest stable CUDA)
+            image = modal.Image.from_registry(
+                "nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04",
+                add_python=python_version  # Dynamically match current environment
+            ).pip_install([
+                "smoltrace",
+                "ddgs",  # DuckDuckGo search
+                "litellm",
+                "transformers",
+                "torch",
+                "accelerate",  # Required for GPU device_map
+                "bitsandbytes",  # For quantization support
+                "sentencepiece",  # For some tokenizers
+                "protobuf",  # For some models
+                "hf_transfer",  # Fast HuggingFace downloads
+                "nvidia-ml-py"  # GPU metrics collection
+            ]).env({
+                # Enable fast downloads and verbose logging
+                "HF_HUB_ENABLE_HF_TRANSFER": "1",
+                "TRANSFORMERS_VERBOSITY": "info",
+                "HF_HUB_VERBOSITY": "info"
+            })
+        else:
+            # Use lightweight image for CPU jobs
+            image = modal.Image.debian_slim(python_version=python_version).pip_install([
+                "smoltrace",
+                "ddgs",  # DuckDuckGo search
+                "litellm"
+            ])
 
         @app.function(
             image=image,
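
The image pattern above can be exercised in isolation before wiring it into job submission. A minimal smoke test (a sketch that assumes valid Modal credentials; the app name and function are hypothetical, but every API used here appears in the diff):

```python
import sys
import modal

app = modal.App("image-smoke-test")  # hypothetical app name
python_version = f"{sys.version_info.major}.{sys.version_info.minor}"

# Same debian_slim + pinned-Python pattern as the CPU branch above.
image = modal.Image.debian_slim(python_version=python_version).pip_install(["litellm"])

@app.function(image=image, timeout=300)
def report_python() -> str:
    import sys
    return sys.version  # should match the python_version pin above

if __name__ == "__main__":
    with app.run():
        print(report_python.remote())
```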
@@ -170,40 +199,160 @@
             secrets=[
                 modal.Secret.from_dict(env_vars)
             ],
-            timeout=3600  # 1 hour timeout
+            timeout=3600,  # 1 hour timeout
+            serialized=True  # Required for functions defined in local scope
         )
-        def run_evaluation():
+        def run_evaluation(command_to_run: str):
             """Run SMOLTRACE evaluation on Modal"""
             import subprocess
-            result = subprocess.run(command, shell=True, capture_output=True, text=True)
-            return {
-                "returncode": result.returncode,
-                "stdout": result.stdout,
-                "stderr": result.stderr
-            }
+            import sys
+            import os
+
+            print("=" * 80)
+            print(f"Starting SMOLTRACE evaluation on Modal")
+            print(f"Command: {command_to_run}")
+            print(f"Python version: {sys.version}")
+
+            # Show GPU info if available
+            try:
+                import torch
+                if torch.cuda.is_available():
+                    print(f"GPU: {torch.cuda.get_device_name(0)}")
+                    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
+            except:
+                pass
+
+            print("=" * 80)
+            print("\nNote: Model download may take several minutes for large models (14B = ~28GB)")
+            print("Downloading and initializing model...\n")
+
+            try:
+                # Run with live output instead of capture_output so we can see progress
+                result = subprocess.run(
+                    command_to_run,
+                    shell=True,
+                    capture_output=False,  # Stream output in real-time
+                    text=True
+                )
+
+                # Since we're not capturing, create a success message
+                print("\n" + "=" * 80)
+                print("EVALUATION COMPLETED")
+                print(f"Return code: {result.returncode}")
+                print("=" * 80)
+
+                return {
+                    "returncode": result.returncode,
+                    "stdout": "Check Modal logs for full output (streaming mode)",
+                    "stderr": ""
+                }
+            except Exception as e:
+                error_msg = f"Error running evaluation: {str(e)}"
+                print("\n" + "=" * 80)
+                print("EVALUATION FAILED")
+                print(error_msg)
+                print("=" * 80)
+                import traceback
+                traceback.print_exc()
+                return {
+                    "returncode": -1,
+                    "stdout": "",
+                    "stderr": error_msg
+                }
+
+        # Submit the job using Modal's remote() in a background thread
+        # Note: spawn() doesn't work well with dynamically created apps
+        # remote() ensures the job actually executes, threading keeps UI responsive
+        import threading
+
+        # Store result in a shared dict since we're using threading
+        result_container = {"modal_call_id": None, "started": False}
+
+        def run_job_on_modal():
+            """Run the Modal job in background thread"""
+            try:
+                with app.run():
+                    # Use remote() instead of spawn() for dynamic apps
+                    # This ensures the function actually executes
+                    function_call = run_evaluation.remote(command)
+                    result_container["started"] = True
+                    print(f"Modal job completed with return code: {function_call.get('returncode', 'unknown')}")
+            except Exception as e:
+                print(f"Error running Modal job: {e}")
+                result_container["error"] = str(e)
+
+        # Start the job in a background thread so we don't block the UI
+        job_thread = threading.Thread(target=run_job_on_modal, daemon=True)
+        job_thread.start()
+
+        # Give Modal a moment to start the job and capture any immediate errors
+        import time
+        time.sleep(2)
 
-        # Submit the job
-        # Note: Modal doesn't have a direct "submit and return" API like HF Jobs
-        # For now, we'll return the command that should be run
-        # In production, you'd use Modal's async API or spawn the function
+        # Use job_id as the tracking ID since remote() doesn't give us a call_id
+        modal_call_id = f"modal-{job_id}"
 
         return {
             "success": True,
             "job_id": job_id,
+            "modal_call_id": modal_call_id,  # Modal's internal function call ID
             "platform": "Modal",
             "hardware": modal_gpu or "CPU",
             "command": command,
-            "status": "pending",
-            "message": "Modal job configured. Use Modal CLI to submit: modal run modal_job_submission.py",
-            "note": "Direct Modal API submission requires async handling. For now, use the generated command with Modal CLI."
+            "status": "submitted",
+            "message": f"Job successfully submitted to Modal (hardware: {modal_gpu or 'CPU'})",
+            "instructions": f"""
+✅ Job submitted successfully!
+
+**Job Details:**
+- Run ID: {job_id}
+- Modal Call ID: {modal_call_id}
+- Hardware: {modal_gpu or "CPU"}
+- Platform: Modal (serverless compute)
+
+**What happens next:**
+1. Job starts running on Modal infrastructure
+2. For GPU jobs: Model downloads first (14B models = ~28GB, can take 10-15 min)
+3. SMOLTRACE evaluates your model
+4. Results are automatically pushed to HuggingFace datasets
+5. They will appear in TraceMind leaderboard when complete
+
+**Monitoring**: Check Modal dashboard for real-time logs and progress:
+https://modal.com/apps
+
+**Expected Duration**:
+- CPU jobs (API models): 2-5 minutes
+- GPU jobs (local models): 15-30 minutes (includes model download)
+
+**Cost**: Modal charges per-second usage. Estimated cost: $0.01-1.00 depending on model size and hardware.
+""".strip()
         }
 
     except Exception as e:
-        return {
-            "success": False,
-            "error": f"Failed to create Modal job: {str(e)}",
-            "job_id": job_id
-        }
+        error_msg = str(e)
+
+        # Check for common Modal errors
+        if "MODAL_TOKEN_ID" in error_msg or "authentication" in error_msg.lower():
+            return {
+                "success": False,
+                "error": "Modal authentication failed. Please verify your MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in Settings.",
+                "job_id": job_id,
+                "troubleshooting": """
+**Steps to fix:**
+1. Go to https://modal.com/settings/tokens
+2. Create a new token
+3. Copy Token ID (starts with 'ak-') and Token Secret (starts with 'as-')
+4. Add them to Settings in TraceMind
+5. Try again
+"""
+            }
+        else:
+            return {
+                "success": False,
+                "error": f"Failed to submit Modal job: {error_msg}",
+                "job_id": job_id,
+                "command": command
+            }
 
 
 def _auto_select_modal_hardware(provider: str, model: str) -> Optional[str]:
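
For context, here is a hypothetical caller for `submit_modal_job` showing how the result dict introduced in this commit would be consumed. The full signature sits outside these hunks, so the keyword arguments are assumptions; the result keys (`success`, `status`, `modal_call_id`, `instructions`, `troubleshooting`) come from the diff above.

```python
from utils.modal_job_submission import submit_modal_job

# Parameter names are assumptions; see the function definition for the real ones.
result = submit_modal_job(
    model="meta-llama/Llama-3.1-8B-Instruct",
    provider="transformers",
    hardware="auto",
)

if result["success"]:
    # Keys below come from the success dict added in this commit
    print(result["status"], result["modal_call_id"])  # e.g. submitted modal-<job_id>
    print(result["instructions"])
else:
    print(result["error"])
    print(result.get("troubleshooting", ""))
```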
 