File size: 15,859 Bytes
8c679b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
"""
Job Monitoring Screen for TraceMind-AI
Allows users to monitor HuggingFace Jobs status and view logs
"""

import os
from enum import Enum
from typing import Optional

import gradio as gr


# Seconds between automatic log reloads while "Auto-refresh logs" is ticked.
_LOG_REFRESH_SECONDS = 5

# Status name -> emoji. Shared by the single-job view and the recent-jobs
# table so both always render a given status the same way.
_STATUS_EMOJI = {
    "QUEUED": "⏳",
    "STARTING": "🔄",
    "RUNNING": "▶️",
    "SUCCEEDED": "✅",
    "COMPLETED": "✅",  # Alternative success status
    "FAILED": "❌",
    "ERROR": "❌",  # Alternative failure status
    "CANCELLED": "🚫",
    "CANCELED": "🚫",  # US spelling variant
    "STOPPED": "⏹️",
    "TIMEOUT": "⏱️",
}
_UNKNOWN_STATUS_EMOJI = "❓"

# Status name -> CSS colour used for the status heading in the single-job view.
_STATUS_COLOR = {
    "QUEUED": "#FFA500",
    "STARTING": "#1E90FF",
    "RUNNING": "#00CED1",
    "SUCCEEDED": "#32CD32",
    "COMPLETED": "#32CD32",  # Alternative success status
    "FAILED": "#DC143C",
    "ERROR": "#DC143C",  # Alternative failure status
    "CANCELLED": "#696969",
    "CANCELED": "#696969",  # US spelling variant
    "STOPPED": "#A9A9A9",
    "TIMEOUT": "#FF8C00",
}
_UNKNOWN_STATUS_COLOR = "#888888"

# Status name -> one-line advice shown under "Next Steps".
_NEXT_STEPS = {
    "QUEUED": "⏳ Your job is waiting in the queue. It will start soon.",
    "STARTING": "🔄 Your job is being initialized. This usually takes 1-2 minutes.",
    "RUNNING": "▶️ Your job is running! Click 'Load Logs' below to view progress.",
    "SUCCEEDED": "✅ Your job completed successfully! Check the Leaderboard tab for results.",
    "COMPLETED": "✅ Your job completed successfully! Check the Leaderboard tab for results.",
    "FAILED": "❌ Your job failed. Click 'Load Logs' below to see what went wrong.",
    "ERROR": "❌ Your job failed. Click 'Load Logs' below to see what went wrong.",
    "CANCELLED": "🚫 Your job was stopped. You can submit a new job from the 'New Evaluation' tab.",
    "CANCELED": "🚫 Your job was stopped. You can submit a new job from the 'New Evaluation' tab.",
    "STOPPED": "🚫 Your job was stopped. You can submit a new job from the 'New Evaluation' tab.",
    "TIMEOUT": "⏱️ Your job exceeded the time limit. Consider optimizing your model or increasing the timeout.",
}
_UNKNOWN_NEXT_STEP = "❓ Unknown status. Try refreshing or check the HF Jobs dashboard."

# Shown when HF_TOKEN is absent; the recent-jobs tab appends the requirements note.
_TOKEN_SETUP_STEPS_MD = """
### ⚠️ HuggingFace Token Not Configured

**Action Required**:
1. Go to "⚙️ Settings" in the sidebar
2. Enter your HuggingFace token (must have "Run Jobs" permission)
3. Click "💾 Save API Keys"
4. Return to this tab and try again
"""

_TOKEN_REQUIREMENTS_MD = """
**Note**: Your HF token must:
- Start with `hf_`
- Have **Read**, **Write**, AND **Run Jobs** permissions
- Be from a HuggingFace Pro account ($9/month)

Get your token at: https://huggingface.co/settings/tokens
"""


def _normalize_status(status) -> str:
    """Return *status* as an upper-case string, or "UNKNOWN" when it is falsy.

    Enum members are unwrapped to their ``.value`` first: ``str()`` of a plain
    Enum member is ``"ClassName.MEMBER"``, which would never match the lookup
    tables above.
    """
    if not status:
        return "UNKNOWN"
    if isinstance(status, Enum):
        status = status.value
    return str(status).upper()


def _get_next_steps(status) -> str:
    """Return the "Next Steps" advice line for a job status."""
    return _NEXT_STEPS.get(_normalize_status(status), _UNKNOWN_NEXT_STEP)


def _hf_token_configured() -> bool:
    """Return True when the HF_TOKEN environment variable holds a non-blank value."""
    token = os.environ.get("HF_TOKEN")
    return bool(token and token.strip())


def create_job_monitoring_screen():
    """
    Create the job monitoring screen for HF Jobs

    Builds three tabs (inspect a single job, list recent jobs, usage guide)
    inside an initially hidden column and wires the buttons to handlers that
    call into ``utils.hf_jobs_submission``. Must be called inside an active
    ``gr.Blocks`` context.

    Returns:
        gr.Column: Gradio Column component for job monitoring
    """
    with gr.Column(visible=False) as job_monitoring_interface:
        gr.Markdown("""
        # 🔍 Job Monitoring

        Monitor your HuggingFace Jobs in real-time. Check job status, view logs, and track evaluation progress.
        """)

        with gr.Tabs():
            # Tab 1: Single Job Inspection
            with gr.Tab("📋 Inspect Job"):
                gr.Markdown("""
                ### Inspect a Specific Job

                Enter a HuggingFace Job ID to view its status and logs.
                """)

                with gr.Row():
                    job_id_input = gr.Textbox(
                        label="HF Job ID",
                        placeholder="e.g., kshitijthakkar/691eb073748f86bfa7144fcc",
                        info="Format: username/job_hash"
                    )

                with gr.Row():
                    inspect_btn = gr.Button("🔍 Inspect Job", variant="primary")
                    refresh_btn = gr.Button("🔄 Refresh", variant="secondary")

                # Job Status Section
                with gr.Accordion("📊 Job Status", open=True):
                    job_status_display = gr.Markdown("Enter a Job ID and click 'Inspect Job' to view status")

                # Job Logs Section
                with gr.Accordion("📜 Job Logs", open=True):
                    with gr.Row():
                        show_logs_btn = gr.Button("📥 Load Logs", variant="secondary")
                        auto_refresh_logs = gr.Checkbox(
                            label=f"Auto-refresh logs (every {_LOG_REFRESH_SECONDS}s)",
                            value=False
                        )

                    job_logs_display = gr.Code(
                        label="Job Logs",
                        language="shell",
                        value="Click 'Load Logs' to view job output",
                        lines=20
                    )

                # Timer that drives auto-refresh; starts inactive and is toggled
                # by the checkbox. Older Gradio releases have no gr.Timer, in
                # which case the checkbox stays inert rather than crashing.
                logs_timer = (
                    gr.Timer(_LOG_REFRESH_SECONDS, active=False)
                    if hasattr(gr, "Timer")
                    else None
                )

            # Tab 2: Recent Jobs List
            with gr.Tab("📑 Recent Jobs"):
                gr.Markdown("""
                ### Your Recent Jobs

                View a list of your recent HuggingFace Jobs.
                """)

                with gr.Row():
                    list_jobs_btn = gr.Button("📋 Load Recent Jobs", variant="primary")
                    jobs_limit = gr.Slider(
                        minimum=5,
                        maximum=50,
                        value=10,
                        step=5,
                        label="Number of jobs to fetch"
                    )

                recent_jobs_display = gr.Markdown("Click 'Load Recent Jobs' to view your jobs")

            # Tab 3: Job Monitoring Guide
            with gr.Tab("📖 Guide"):
                gr.Markdown("""
                ### Using Job Monitoring

                #### How to Get Your Job ID

                After submitting an evaluation from the "New Evaluation" tab, you'll receive:
                - **Run ID (SMOLTRACE)**: Used for tracking results in datasets (e.g., `job_3a22ceca`)
                - **HF Job ID**: Used for monitoring the actual job (e.g., `kshitijthakkar/691eb073748f86bfa7144fcc`)

                Use the **HF Job ID** here to monitor your job.

                #### Job Status Values

                - **QUEUED**: Job is waiting to start
                - **STARTING**: Job is being initialized
                - **RUNNING**: Job is currently executing
                - **SUCCEEDED**: Job completed successfully
                - **FAILED**: Job encountered an error
                - **CANCELLED**: Job was manually cancelled
                - **STOPPED**: Job was stopped by the system

                #### CLI Commands Reference

                You can also use the HuggingFace CLI to monitor jobs:

                ```bash
                # List your running jobs
                hf jobs ps

                # Inspect a specific job
                hf jobs inspect <job_id>

                # View logs from a job
                hf jobs logs <job_id>

                # Follow logs in real-time
                hf jobs logs <job_id> --follow

                # Cancel a job
                hf jobs cancel <job_id>
                ```

                #### Tips

                - 💡 **Bookmark your Job ID** after submission for easy access
                - 🔄 **Use auto-refresh** for logs when job is running
                - 📊 **Check status regularly** to catch any issues early
                - 📝 **Review logs** if your job fails to understand what went wrong
                - 🎯 **Results appear in leaderboard** once job succeeds and uploads datasets
                """)

        # Functions for job monitoring
        def inspect_job(job_id: str):
            """Fetch one job's status and render it as a Markdown update.

            Args:
                job_id: HF Job ID as typed by the user; surrounding whitespace
                    is ignored.

            Returns:
                A ``gr.update`` carrying Markdown: a prompt when the ID is
                blank, setup instructions when HF_TOKEN is missing, an error
                report when the lookup fails, otherwise the status summary.
            """
            job_id = job_id.strip() if job_id else ""
            if not job_id:
                return gr.update(value="❌ Please enter a Job ID")

            # Check if token is configured before making API call
            if not _hf_token_configured():
                return gr.update(value=_TOKEN_SETUP_STEPS_MD)

            # Imported lazily so the screen can be built without this module loaded.
            from utils.hf_jobs_submission import check_job_status

            result = check_job_status(job_id)

            if not result.get("success"):
                error_msg = str(result.get("error") or "Unknown error")

                return gr.update(
                    value=f"""
### ❌ Failed to Fetch Job Status

**Error**: {error_msg}

**Job ID**: `{job_id}`

**Troubleshooting**:
- Verify the Job ID format is correct (format: `username/job_hash`)
- Check that the job exists in your account
- Ensure your HF token has the correct permissions
- Token must have **Run Jobs** permission enabled
"""
                )

            status_str = _normalize_status(result.get("status", "unknown"))
            status_emoji = _STATUS_EMOJI.get(status_str, _UNKNOWN_STATUS_EMOJI)
            status_color = _STATUS_COLOR.get(status_str, _UNKNOWN_STATUS_COLOR)

            created_at = result.get("created_at", "N/A")
            flavor = result.get("flavor", "N/A")
            job_url = result.get("url", None)

            # Format job URL as clickable link
            job_url_display = f"[Open in HuggingFace]({job_url})" if job_url else "N/A"

            return gr.update(
                value=f"""
### {status_emoji} Job Status: <span style="color: {status_color};">{status_str}</span>

**Job ID**: `{job_id}`

#### Details

- **Created**: {created_at}
- **Hardware**: {flavor}
- **Job URL**: {job_url_display}

#### Next Steps

{_get_next_steps(status_str)}

---

💡 **Tip**: Use "📥 Load Logs" button below to view detailed execution logs and check progress.
"""
            )

        def load_job_logs(job_id: str):
            """Fetch a job's logs and return them as an update for the log viewer.

            Args:
                job_id: HF Job ID as typed by the user; surrounding whitespace
                    is ignored.

            Returns:
                A ``gr.update`` whose value is the log text, or a short
                explanatory message when the ID is blank, HF_TOKEN is missing,
                the fetch fails, or no logs exist yet.
            """
            job_id = job_id.strip() if job_id else ""
            if not job_id:
                return gr.update(value="❌ Please enter a Job ID first")

            # Check if token is configured before making API call
            if not _hf_token_configured():
                return gr.update(
                    value="⚠️ HuggingFace Token Not Configured\n\nPlease configure your HF token in Settings first."
                )

            from utils.hf_jobs_submission import get_job_logs

            result = get_job_logs(job_id)

            if not result.get("success"):
                error_msg = str(result.get("error") or "Unknown error")
                return gr.update(
                    value=f"❌ Failed to fetch logs: {error_msg}\n\nEnsure your HF token has 'Run Jobs' permission."
                )

            logs = result.get("logs", "")
            if not logs or not logs.strip():
                return gr.update(value="ℹ️ No logs available yet. Job may not have started.\n\nTry refreshing after a minute.")

            return gr.update(value=logs)

        def list_recent_jobs(limit: int):
            """List the user's recent jobs as a Markdown table.

            Args:
                limit: Maximum number of jobs to request; coerced to ``int``
                    because the slider may deliver a float.

            Returns:
                A ``gr.update`` carrying Markdown: setup instructions when
                HF_TOKEN is missing, an error report when the listing fails,
                a getting-started note when there are no jobs, otherwise the
                jobs table.
            """
            # Check if token is configured before making API call
            if not _hf_token_configured():
                return gr.update(value=_TOKEN_SETUP_STEPS_MD + _TOKEN_REQUIREMENTS_MD)

            from utils.hf_jobs_submission import list_user_jobs

            result = list_user_jobs(limit=int(limit))

            if not result.get("success"):
                # Coerce to str: the backend's error value is not guaranteed to be one.
                error_msg = str(result.get("error") or "Unknown error")
                lowered = error_msg.lower()

                # Check for common error patterns
                if "invalid" in lowered or "token" in lowered:
                    troubleshooting = """
**Troubleshooting**:
- ⚠️ **Token may be invalid** - Regenerate your token at HuggingFace settings
- ✅ Ensure token has **Run Jobs** permission (not just Read/Write)
- ✅ Verify you have an active **HuggingFace Pro account**
- ✅ Token should start with `hf_`
"""
                else:
                    troubleshooting = """
**Troubleshooting**:
- Refresh this page and try again
- Check your internet connection
- Verify HuggingFace services are operational
"""

                return gr.update(
                    value=f"""
### ❌ Failed to Fetch Jobs

**Error**: {error_msg}

{troubleshooting}
"""
                )

            jobs = result.get("jobs", [])
            if not jobs:
                return gr.update(
                    value="""
### ℹ️ No Jobs Found

You haven't submitted any jobs yet.

**Get Started**:
1. Go to the "New Evaluation" tab
2. Configure your model and settings
3. Submit an evaluation job
4. Come back here to monitor progress!
"""
                )

            # Build jobs table
            lines = [
                "### 📋 Your Recent Jobs",
                "",
                "| Job ID | Status | Created At |",
                "|--------|--------|------------|",
            ]

            for job in jobs:
                job_id = job.get("job_id", "N/A")
                created = job.get("created_at", "N/A")
                status_str = _normalize_status(job.get("status", "unknown"))
                status_emoji = _STATUS_EMOJI.get(status_str, _UNKNOWN_STATUS_EMOJI)

                lines.append(f"| `{job_id}` | {status_emoji} {status_str} | {created} |")

            lines += [
                "",
                f"**Total Jobs**: {len(jobs)}",
                "",
                "💡 **Tip**: Copy a Job ID and paste it in the 'Inspect Job' tab to view details and logs.",
            ]

            return gr.update(value="\n".join(lines))

        # Wire up button events
        inspect_btn.click(
            fn=inspect_job,
            inputs=[job_id_input],
            outputs=[job_status_display]
        )

        refresh_btn.click(
            fn=inspect_job,
            inputs=[job_id_input],
            outputs=[job_status_display]
        )

        show_logs_btn.click(
            fn=load_job_logs,
            inputs=[job_id_input],
            outputs=[job_logs_display]
        )

        list_jobs_btn.click(
            fn=list_recent_jobs,
            inputs=[jobs_limit],
            outputs=[recent_jobs_display]
        )

        # Auto-refresh: each tick reloads the logs; the checkbox switches the
        # timer on and off by returning an updated Timer.
        if logs_timer is not None:
            logs_timer.tick(
                fn=load_job_logs,
                inputs=[job_id_input],
                outputs=[job_logs_display]
            )
            auto_refresh_logs.change(
                fn=lambda enabled: gr.Timer(active=bool(enabled)),
                inputs=[auto_refresh_logs],
                outputs=[logs_timer]
            )

        return job_monitoring_interface


if __name__ == "__main__":
    # Standalone preview: mount the screen in an otherwise empty Blocks app.
    demo = gr.Blocks()
    with demo:
        screen = create_job_monitoring_screen()
        # The column is created with visible=False, so reveal it here.
        screen.visible = True
    demo.launch()