Spaces:

MCP-1st-Birthday
/

TraceMind

Running

App Files Files Community

TraceMind / screens /job_monitoring.py

kshitijthakkar

feat: Add comprehensive HF Jobs monitoring system

8c679b3 22 days ago

raw

history blame contribute delete

15.9 kB

	"""
	Job Monitoring Screen for TraceMind-AI
	Allows users to monitor HuggingFace Jobs status and view logs
	"""

	import gradio as gr
	import os
	from typing import Optional


	def create_job_monitoring_screen():
	    """
	    Create the job monitoring screen for HF Jobs.

	    Builds a hidden ``gr.Column`` with three tabs (inspect a single job, list
	    recent jobs, usage guide), defines the event handlers and wires them to
	    the buttons. The handlers read ``HF_TOKEN`` from the environment and
	    delegate the actual API work to ``utils.hf_jobs_submission``, which is
	    imported lazily on first use.

	    Intended to be called inside an active ``gr.Blocks`` context, as the
	    standalone entry point of this module does.

	    Returns:
	        gr.Column: Gradio Column component for job monitoring
	    """
	    # Presentation tables shared by the single-job view and the jobs table so
	    # both always render a given status the same way.
	    status_emojis = {
	        "QUEUED": "⏳",
	        "STARTING": "🔄",
	        "RUNNING": "▶️",
	        "SUCCEEDED": "✅",
	        "COMPLETED": "✅",  # Alternative success status
	        "FAILED": "❌",
	        "ERROR": "❌",  # Alternative failure status
	        "CANCELLED": "🚫",
	        "CANCELED": "🚫",  # US spelling variant
	        "STOPPED": "⏹️",
	        "TIMEOUT": "⏱️",
	    }
	    status_colors = {
	        "QUEUED": "#FFA500",
	        "STARTING": "#1E90FF",
	        "RUNNING": "#00CED1",
	        "SUCCEEDED": "#32CD32",
	        "COMPLETED": "#32CD32",  # Alternative success status
	        "FAILED": "#DC143C",
	        "ERROR": "#DC143C",  # Alternative failure status
	        "CANCELLED": "#696969",
	        "CANCELED": "#696969",  # US spelling variant
	        "STOPPED": "#A9A9A9",
	        "TIMEOUT": "#FF8C00",
	    }

	    def _normalize_status(status) -> str:
	        """Return an upper-case lookup key such as ``RUNNING`` for a raw status."""
	        # str() of an Enum member is "ClassName.MEMBER", which would never match
	        # the tables above, so prefer the member's value when there is one.
	        # NOTE(review): assumes any object exposing ``.value`` is enum-like.
	        raw = getattr(status, "value", status)
	        if not raw:
	            return "UNKNOWN"
	        text = str(raw).strip().upper()
	        # Also accept an already-stringified enum ("JobStage.RUNNING").
	        return text.rsplit(".", 1)[-1] or "UNKNOWN"

	    def _hf_token_configured() -> bool:
	        """Tell whether a non-blank ``HF_TOKEN`` is present in the environment."""
	        token = os.environ.get("HF_TOKEN")
	        return bool(token and token.strip())

	    def _get_next_steps(status: str) -> str:
	        """Get next steps based on job status"""
	        stopped = "🚫 Your job was stopped. You can submit a new job from the 'New Evaluation' tab."
	        succeeded = "✅ Your job completed successfully! Check the Leaderboard tab for results."
	        failed = "❌ Your job failed. Click 'Load Logs' below to see what went wrong."
	        next_steps = {
	            "QUEUED": "⏳ Your job is waiting in the queue. It will start soon.",
	            "STARTING": "🔄 Your job is being initialized. This usually takes 1-2 minutes.",
	            "RUNNING": "▶️ Your job is running! Click 'Load Logs' below to view progress.",
	            "SUCCEEDED": succeeded,
	            "COMPLETED": succeeded,
	            "FAILED": failed,
	            "ERROR": failed,
	            "CANCELLED": stopped,
	            "CANCELED": stopped,
	            "STOPPED": stopped,
	            "TIMEOUT": "⏱️ Your job exceeded the time limit. Consider optimizing your model or increasing the timeout.",
	        }
	        return next_steps.get(
	            _normalize_status(status),
	            "❓ Unknown status. Try refreshing or check the HF Jobs dashboard.",
	        )

	    def inspect_job(job_id: str):
	        """
	        Inspect a specific job's status.

	        Args:
	            job_id: HF Job ID as typed by the user; surrounding whitespace is ignored.

	        Returns:
	            A ``gr.update`` carrying Markdown for the status panel: a validation
	            or configuration hint, an error report, or the job details.
	        """
	        job_id = (job_id or "").strip()
	        if not job_id:
	            return gr.update(value="❌ Please enter a Job ID")

	        # Check if token is configured before making API call
	        if not _hf_token_configured():
	            return gr.update(
	                value="""
	### ⚠️ HuggingFace Token Not Configured

	Action Required:
	1. Go to "⚙️ Settings" in the sidebar
	2. Enter your HuggingFace token (must have "Run Jobs" permission)
	3. Click "💾 Save API Keys"
	4. Return to this tab and try again
	"""
	            )

	        from utils.hf_jobs_submission import check_job_status

	        result = check_job_status(job_id)

	        if not result.get("success"):
	            # The error may be None or a non-string object; always show text.
	            error_msg = str(result.get("error") or "Unknown error")

	            return gr.update(
	                value=f"""
	### ❌ Failed to Fetch Job Status

	Error: {error_msg}

	Job ID: `{job_id}`

	Troubleshooting:
	- Verify the Job ID format is correct (format: `username/job_hash`)
	- Check that the job exists in your account
	- Ensure your HF token has the correct permissions
	- Token must have Run Jobs permission enabled
	"""
	            )

	        # Format status with emoji and colour
	        status_str = _normalize_status(result.get("status", "unknown"))
	        status_emoji = status_emojis.get(status_str, "❓")
	        status_color = status_colors.get(status_str, "#888888")

	        created_at = result.get("created_at", "N/A")
	        flavor = result.get("flavor", "N/A")
	        job_url = result.get("url", None)

	        # Format job URL as clickable link
	        job_url_display = f"[Open in HuggingFace]({job_url})" if job_url else "N/A"
	        next_steps = _get_next_steps(status_str)

	        return gr.update(
	            value=f"""
	### {status_emoji} Job Status: <span style="color: {status_color};">{status_str}</span>

	Job ID: `{job_id}`

	#### Details

	- Created: {created_at}
	- Hardware: {flavor}
	- Job URL: {job_url_display}

	#### Next Steps

	{next_steps}

	---

	💡 Tip: Use "📥 Load Logs" button below to view detailed execution logs and check progress.
	"""
	        )

	    def load_job_logs(job_id: str):
	        """
	        Load logs for a specific job.

	        Args:
	            job_id: HF Job ID as typed by the user; surrounding whitespace is ignored.

	        Returns:
	            A ``gr.update`` for the logs viewer holding either the log text or
	            a short explanation of why no logs could be shown.
	        """
	        job_id = (job_id or "").strip()
	        if not job_id:
	            return gr.update(value="❌ Please enter a Job ID first")

	        # Check if token is configured before making API call
	        if not _hf_token_configured():
	            return gr.update(
	                value="⚠️ HuggingFace Token Not Configured\n\nPlease configure your HF token in Settings first."
	            )

	        from utils.hf_jobs_submission import get_job_logs

	        result = get_job_logs(job_id)

	        if not result.get("success"):
	            error_msg = str(result.get("error") or "Unknown error")
	            return gr.update(
	                value=f"❌ Failed to fetch logs: {error_msg}\n\nEnsure your HF token has 'Run Jobs' permission."
	            )

	        logs = result.get("logs", "")
	        if not logs or not logs.strip():
	            return gr.update(value="ℹ️ No logs available yet. Job may not have started.\n\nTry refreshing after a minute.")

	        return gr.update(value=logs)

	    def list_recent_jobs(limit: int):
	        """
	        List user's recent jobs.

	        Args:
	            limit: Maximum number of jobs to request (coerced to ``int``).

	        Returns:
	            A ``gr.update`` carrying Markdown: a configuration hint, an error
	            report with troubleshooting advice, an empty-state message, or a
	            table with one row per job.
	        """
	        # Check if token is configured before making API call
	        if not _hf_token_configured():
	            return gr.update(
	                value="""
	### ⚠️ HuggingFace Token Not Configured

	Action Required:
	1. Go to "⚙️ Settings" in the sidebar
	2. Enter your HuggingFace token (must have "Run Jobs" permission)
	3. Click "💾 Save API Keys"
	4. Return to this tab and try again

	Note: Your HF token must:
	- Start with `hf_`
	- Have Read, Write, AND Run Jobs permissions
	- Be from a HuggingFace Pro account ($9/month)

	Get your token at: https://huggingface.co/settings/tokens
	"""
	            )

	        from utils.hf_jobs_submission import list_user_jobs

	        result = list_user_jobs(limit=int(limit))

	        if not result.get("success"):
	            # Coerce to text first: the pattern check below needs a string and
	            # the reported error may be None or an exception object.
	            error_msg = str(result.get("error") or "Unknown error")
	            lowered = error_msg.lower()

	            # Check for common error patterns
	            if "invalid" in lowered or "token" in lowered:
	                troubleshooting = """
	Troubleshooting:
	- ⚠️ Token may be invalid - Regenerate your token at HuggingFace settings
	- ✅ Ensure token has Run Jobs permission (not just Read/Write)
	- ✅ Verify you have an active HuggingFace Pro account
	- ✅ Token should start with `hf_`
	"""
	            else:
	                troubleshooting = """
	Troubleshooting:
	- Refresh this page and try again
	- Check your internet connection
	- Verify HuggingFace services are operational
	"""

	            return gr.update(
	                value=f"""
	### ❌ Failed to Fetch Jobs

	Error: {error_msg}

	{troubleshooting}
	"""
	            )

	        jobs = result.get("jobs", [])
	        if not jobs:
	            return gr.update(
	                value="""
	### ℹ️ No Jobs Found

	You haven't submitted any jobs yet.

	Get Started:
	1. Go to the "New Evaluation" tab
	2. Configure your model and settings
	3. Submit an evaluation job
	4. Come back here to monitor progress!
	"""
	            )

	        # Build jobs table. Plain "|" is required here: a backslash-escaped
	        # pipe is a literal character in Markdown and breaks the table.
	        table_parts = [
	            "### 📋 Your Recent Jobs\n\n",
	            "| Job ID | Status | Created At |\n",
	            "|--------|--------|------------|\n",
	        ]

	        for job in jobs:
	            listed_id = job.get("job_id", "N/A")
	            created = job.get("created_at", "N/A")
	            # Use the normalized status for both the emoji lookup and the text.
	            status_str = _normalize_status(job.get("status", "unknown"))
	            status_emoji = status_emojis.get(status_str, "❓")

	            table_parts.append(
	                f"| `{listed_id}` | {status_emoji} {status_str} | {created} |\n"
	            )

	        table_parts.append(f"\nTotal Jobs: {len(jobs)}\n\n")
	        table_parts.append(
	            "💡 Tip: Copy a Job ID and paste it in the 'Inspect Job' tab to view details and logs."
	        )

	        return gr.update(value="".join(table_parts))

	    with gr.Column(visible=False) as job_monitoring_interface:
	        gr.Markdown("""
	# 🔍 Job Monitoring

	Monitor your HuggingFace Jobs in real-time. Check job status, view logs, and track evaluation progress.
	""")

	        with gr.Tabs():
	            # Tab 1: Single Job Inspection
	            with gr.Tab("📋 Inspect Job"):
	                gr.Markdown("""
	### Inspect a Specific Job

	Enter a HuggingFace Job ID to view its status and logs.
	""")

	                with gr.Row():
	                    job_id_input = gr.Textbox(
	                        label="HF Job ID",
	                        placeholder="e.g., kshitijthakkar/691eb073748f86bfa7144fcc",
	                        info="Format: username/job_hash"
	                    )

	                with gr.Row():
	                    inspect_btn = gr.Button("🔍 Inspect Job", variant="primary")
	                    refresh_btn = gr.Button("🔄 Refresh", variant="secondary")

	                # Job Status Section
	                with gr.Accordion("📊 Job Status", open=True):
	                    job_status_display = gr.Markdown("Enter a Job ID and click 'Inspect Job' to view status")

	                # Job Logs Section
	                with gr.Accordion("📜 Job Logs", open=True):
	                    with gr.Row():
	                        show_logs_btn = gr.Button("📥 Load Logs", variant="secondary")
	                        auto_refresh_logs = gr.Checkbox(
	                            label="Auto-refresh logs (every 5s)",
	                            value=False
	                        )

	                    job_logs_display = gr.Code(
	                        label="Job Logs",
	                        language="shell",
	                        value="Click 'Load Logs' to view job output",
	                        lines=20
	                    )

	            # Tab 2: Recent Jobs List
	            with gr.Tab("📑 Recent Jobs"):
	                gr.Markdown("""
	### Your Recent Jobs

	View a list of your recent HuggingFace Jobs.
	""")

	                with gr.Row():
	                    list_jobs_btn = gr.Button("📋 Load Recent Jobs", variant="primary")
	                    jobs_limit = gr.Slider(
	                        minimum=5,
	                        maximum=50,
	                        value=10,
	                        step=5,
	                        label="Number of jobs to fetch"
	                    )

	                recent_jobs_display = gr.Markdown("Click 'Load Recent Jobs' to view your jobs")

	            # Tab 3: Job Monitoring Guide
	            with gr.Tab("📖 Guide"):
	                gr.Markdown("""
	### Using Job Monitoring

	#### How to Get Your Job ID

	After submitting an evaluation from the "New Evaluation" tab, you'll receive:
	- Run ID (SMOLTRACE): Used for tracking results in datasets (e.g., `job_3a22ceca`)
	- HF Job ID: Used for monitoring the actual job (e.g., `kshitijthakkar/691eb073748f86bfa7144fcc`)

	Use the HF Job ID here to monitor your job.

	#### Job Status Values

	- QUEUED: Job is waiting to start
	- STARTING: Job is being initialized
	- RUNNING: Job is currently executing
	- SUCCEEDED: Job completed successfully
	- FAILED: Job encountered an error
	- CANCELLED: Job was manually cancelled
	- STOPPED: Job was stopped by the system

	#### CLI Commands Reference

	You can also use the HuggingFace CLI to monitor jobs:

	```bash
	# List your running jobs
	hf jobs ps

	# Inspect a specific job
	hf jobs inspect <job_id>

	# View logs from a job
	hf jobs logs <job_id>

	# Follow logs in real-time
	hf jobs logs <job_id> --follow

	# Cancel a job
	hf jobs cancel <job_id>
	```

	#### Tips

	- 💡 Bookmark your Job ID after submission for easy access
	- 🔄 Use auto-refresh for logs when job is running
	- 📊 Check status regularly to catch any issues early
	- 📝 Review logs if your job fails to understand what went wrong
	- 🎯 Results appear in leaderboard once job succeeds and uploads datasets
	""")

	        # Wire up button events
	        inspect_btn.click(
	            fn=inspect_job,
	            inputs=[job_id_input],
	            outputs=[job_status_display]
	        )

	        # "Refresh" simply re-runs the same status lookup.
	        refresh_btn.click(
	            fn=inspect_job,
	            inputs=[job_id_input],
	            outputs=[job_status_display]
	        )

	        show_logs_btn.click(
	            fn=load_job_logs,
	            inputs=[job_id_input],
	            outputs=[job_logs_display]
	        )

	        list_jobs_btn.click(
	            fn=list_recent_jobs,
	            inputs=[jobs_limit],
	            outputs=[recent_jobs_display]
	        )

	        # Auto-refresh: the checkbox switches a 5-second timer on and off, and
	        # every tick reloads the logs. gr.Timer does not exist in older Gradio
	        # releases, so the checkbox is only wired when it is available.
	        if hasattr(gr, "Timer"):
	            logs_timer = gr.Timer(5, active=False)
	            auto_refresh_logs.change(
	                fn=lambda enabled: gr.Timer(active=bool(enabled)),
	                inputs=[auto_refresh_logs],
	                outputs=[logs_timer]
	            )
	            logs_timer.tick(
	                fn=load_job_logs,
	                inputs=[job_id_input],
	                outputs=[job_logs_display]
	            )

	    return job_monitoring_interface


	if __name__ == "__main__":
	    # Standalone preview: host the screen in a bare Blocks app and serve it.
	    with gr.Blocks() as demo:
	        # The screen is built hidden, so reveal it when it is the only
	        # content on the page.
	        create_job_monitoring_screen().visible = True
	    demo.launch()