Calcharles committed on
Commit
22df562
·
1 Parent(s): 057702d

updated demo

Browse files
.gitignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ uv.lock
2
+ test_local.py
3
+ test_visualization.py
4
+ pyproject.toml
5
+
6
+ # Python cache files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
app.py CHANGED
@@ -1,530 +1,228 @@
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
- import json
4
- import os
5
- from datetime import datetime
6
- from typing import Dict, List, Any
7
- import numpy as np
8
 
9
- # Sample results will be loaded from sample_bulk_submission.json
 
 
 
 
 
 
10
 
11
- def load_results() -> List[Dict]:
12
- """Load results from file or return sample data"""
13
- results_file = "results.json"
14
- if os.path.exists(results_file):
15
- with open(results_file, 'r') as f:
16
- data = json.load(f)
17
- return data.get("results", [])
18
-
19
- # Load sample data from bulk submission file
20
- sample_file = "sample_bulk_submission.json"
21
- if os.path.exists(sample_file):
22
- with open(sample_file, 'r') as f:
23
- sample_data = json.load(f)
24
- # Convert bulk submission format to results format
25
- results = []
26
- for entry in sample_data:
27
- result = {
28
- "model": "EXAMPLE",
29
- "submitter": "Research Team",
30
- "submission_date": "2025-10-09",
31
- "metrics": entry["metrics"],
32
- "task": "multivariate_forecasting",
33
- "domain": entry["domain"],
34
- "category": entry["category"],
35
- "dataset": entry["dataset"],
36
- "dataset_version": entry["dataset_version"],
37
- "paper_url": "https://example.com/paper1",
38
- "code_url": "https://github.com/example/repo1"
39
- }
40
- results.append(result)
41
- return results
42
-
43
- # Fallback empty results
44
- return []
45
 
46
- def save_results(results: List[Dict]):
47
- """Save results to file"""
48
- results_file = "results.json"
49
- data = {"results": results}
50
- with open(results_file, 'w') as f:
51
- json.dump(data, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- def create_leaderboard_df(results: List[Dict]) -> pd.DataFrame:
54
- """Create a pandas DataFrame for the leaderboard display"""
55
- if not results:
56
- return pd.DataFrame()
57
-
58
- # Extract metrics and create flattened structure
59
- flattened_results = []
60
- for result in results:
61
- metrics = result["metrics"]
62
- row = {
63
- "Rank": 0, # Will be calculated
64
- "Model": result["model"],
65
- "Submitter": result["submitter"],
66
- "Submission Date": result["submission_date"],
67
- "MAE": f"{metrics['MAE']:.3f}",
68
- "Uni-MAE": f"{metrics.get('Uni-MAE', 0):.3f}",
69
- "RMSE": f"{metrics['RMSE']:.3f}",
70
- "MAPE": f"{metrics['MAPE']:.1f}%",
71
- "R²": f"{metrics['R²']:.3f}",
72
- "SMAPE": f"{metrics['SMAPE']:.1f}%",
73
- "Uni-Multi": f"{metrics.get('Uni-Multi', 0):.3f}",
74
- "Task": result["task"],
75
- "Domain": result.get("domain", "general"),
76
- "Category": result.get("category", "traditional"),
77
- "Dataset": result.get("dataset", "MUSED-FM"),
78
- "Dataset Version": result["dataset_version"]
79
- }
80
- flattened_results.append(row)
81
-
82
- # Sort by MAE (lower is better) and assign ranks
83
- flattened_results.sort(key=lambda x: float(x["MAE"]))
84
- for i, row in enumerate(flattened_results):
85
- row["Rank"] = i + 1
86
-
87
- return pd.DataFrame(flattened_results)
88
 
89
- def submit_model(model_name: str, submitter_name: str, mae: float, uni_mae: float, rmse: float,
90
- mape: float, r2: float, smape: float, uni_multi: float, task: str,
91
- domain: str, category: str, dataset: str, dataset_version: str, paper_url: str, code_url: str) -> str:
92
- """Handle model submission"""
93
- try:
94
- # Validate inputs
95
- if not model_name or not submitter_name:
96
- return "❌ Model name and submitter name are required!"
97
-
98
- if mae <= 0 or uni_mae <= 0 or rmse <= 0 or mape < 0 or r2 < 0 or smape < 0 or uni_multi <= 0:
99
- return "❌ All metrics must be positive values!"
100
-
101
- # Load existing results
102
- results = load_results()
103
-
104
- # Check if model already exists
105
- for result in results:
106
- if result["model"].lower() == model_name.lower():
107
- return f"❌ Model '{model_name}' already exists in the leaderboard!"
108
-
109
- # Create new submission
110
- new_submission = {
111
- "model": model_name,
112
- "submitter": submitter_name,
113
- "submission_date": datetime.now().strftime("%Y-%m-%d"),
114
- "metrics": {
115
- "MAE": float(mae),
116
- "Uni-MAE": float(uni_mae),
117
- "RMSE": float(rmse),
118
- "MAPE": float(mape),
119
- "R²": float(r2),
120
- "SMAPE": float(smape),
121
- "Uni-Multi": float(uni_multi)
122
- },
123
- "task": task,
124
- "domain": domain,
125
- "category": category,
126
- "dataset": dataset,
127
- "dataset_version": dataset_version,
128
- "paper_url": paper_url,
129
- "code_url": code_url
130
- }
131
-
132
- # Add to results
133
- results.append(new_submission)
134
- save_results(results)
135
 
136
- return f"✅ Successfully submitted model '{model_name}' to the leaderboard!"
 
 
137
 
138
- except Exception as e:
139
- return f"❌ Error submitting model: {str(e)}"
140
-
141
- def update_leaderboard_by_domain(domain: str = "all"):
142
- """Update the leaderboard display filtered by domain"""
143
- results = load_results()
144
- if domain != "all":
145
- results = [r for r in results if r.get("domain", "general") == domain]
146
- df = create_leaderboard_df(results)
147
- return df
148
-
149
- def update_leaderboard_by_category(category: str = "all"):
150
- """Update the leaderboard display filtered by category"""
151
- results = load_results()
152
- if category != "all":
153
- results = [r for r in results if r.get("category", "traditional") == category]
154
- df = create_leaderboard_df(results)
155
- return df
156
-
157
- def update_leaderboard_overall(domain_filter: str = "all", category_filter: str = "all", dataset_filter: str = "all"):
158
- """Update the overall leaderboard display with optional filtering"""
159
- results = load_results()
160
-
161
- # Apply filters
162
- if domain_filter != "all":
163
- results = [r for r in results if r.get("domain", "general") == domain_filter]
164
- if category_filter != "all":
165
- results = [r for r in results if r.get("category", "traditional") == category_filter]
166
- if dataset_filter != "all":
167
- results = [r for r in results if r.get("dataset", "MUSED-FM") == dataset_filter]
168
-
169
- df = create_leaderboard_df(results)
170
- return df
171
-
172
- def get_domains():
173
- """Get list of available domains"""
174
- results = load_results()
175
- domains = list(set([r.get("domain", "general") for r in results]))
176
- return ["all"] + sorted(domains)
177
-
178
- def get_datasets():
179
- """Get list of available datasets"""
180
- results = load_results()
181
- datasets = list(set([r.get("dataset", "MUSED-FM") for r in results]))
182
- return ["all"] + sorted(datasets)
183
-
184
- def get_categories():
185
- """Get list of available categories"""
186
- results = load_results()
187
- categories = list(set([r.get("category", "traditional") for r in results]))
188
- return ["all"] + sorted(categories)
189
-
190
- def get_datasets_by_domain_category(domain: str, category: str):
191
- """Get datasets filtered by domain and category"""
192
- results = load_results()
193
- filtered_results = [r for r in results if
194
- (domain == "all" or r.get("domain", "general") == domain) and
195
- (category == "all" or r.get("category", "traditional") == category)]
196
- datasets = list(set([r.get("dataset", "MUSED-FM") for r in filtered_results]))
197
- return ["all"] + sorted(datasets)
198
-
199
- def submit_bulk_results(model_name: str, submitter_name: str, results_data: str, paper_url: str, code_url: str) -> str:
200
- """Handle bulk submission of results for multiple domain/dataset combinations"""
201
- try:
202
- import json
203
- # Parse the bulk results data
204
- bulk_data = json.loads(results_data)
205
 
206
- if not isinstance(bulk_data, list):
207
- return "❌ Bulk data must be a list of result entries!"
208
-
209
- # Load existing results
210
- existing_results = load_results()
211
-
212
- # Validate and add each result
213
- added_count = 0
214
- for result_entry in bulk_data:
215
- # Validate required fields
216
- required_fields = ["domain", "category", "dataset", "metrics"]
217
- if not all(field in result_entry for field in required_fields):
218
- continue
219
-
220
- # Create submission entry
221
- submission = {
222
- "model": model_name,
223
- "submitter": submitter_name,
224
- "submission_date": datetime.now().strftime("%Y-%m-%d"),
225
- "metrics": result_entry["metrics"],
226
- "task": "multivariate_forecasting",
227
- "domain": result_entry["domain"],
228
- "category": result_entry["category"],
229
- "dataset": result_entry["dataset"],
230
- "dataset_version": result_entry.get("dataset_version", "v1.0"),
231
- "paper_url": paper_url,
232
- "code_url": code_url
233
- }
234
-
235
- # Check for duplicates
236
- is_duplicate = any(
237
- r["model"].lower() == model_name.lower() and
238
- r["domain"] == result_entry["domain"] and
239
- r["category"] == result_entry["category"] and
240
- r["dataset"] == result_entry["dataset"]
241
- for r in existing_results
242
- )
243
 
244
- if not is_duplicate:
245
- existing_results.append(submission)
246
- added_count += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
- if added_count > 0:
249
- save_results(existing_results)
250
- return f"✅ Successfully submitted {added_count} result entries for model '{model_name}'!"
251
- else:
252
- return "❌ No new results were added. Check for duplicates or invalid data."
253
-
254
- except json.JSONDecodeError:
255
- return "❌ Invalid JSON format in bulk results data!"
256
- except Exception as e:
257
- return f"❌ Error submitting bulk results: {str(e)}"
258
-
259
- # Create the Gradio interface
260
- with gr.Blocks(title="MUSED-FM Leaderboard", theme=gr.themes.Soft()) as demo:
261
- gr.Markdown("""
262
- # 🏆 MUSED-FM Leaderboard
263
-
264
- Welcome to the MUSED-FM (Multivariate Time Series Dataset) Leaderboard! This leaderboard tracks the performance of different models on multivariate time series forecasting tasks across various domains and datasets.
265
-
266
- ## 📊 Evaluation Metrics
267
- - **MAE**: Mean Absolute Error (lower is better)
268
- - **Uni-MAE**: Univariate Mean Absolute Error (lower is better)
269
- - **RMSE**: Root Mean Square Error (lower is better)
270
- - **MAPE**: Mean Absolute Percentage Error (lower is better)
271
- - **R²**: Coefficient of Determination (higher is better)
272
- - **SMAPE**: Symmetric Mean Absolute Percentage Error (lower is better)
273
- - **Uni-Multi**: Univariate-Multivariate comparison metric (lower is better)
274
-
275
- ## 🎯 Tasks
276
- - **multivariate_forecasting**: Multivariate time series forecasting
277
-
278
- ## 🌐 Domains & Categories
279
- - **Causal Model** (synthetic): Synthetic causal modeling datasets
280
- - **Dynamic** (synthetic): Dynamic system datasets
281
- - **Energy** (traditional): Energy consumption and production
282
- - **Engineering** (traditional): Engineering sensor data
283
- - **Environment** (traditional): Environmental monitoring
284
- - **Finance** (traditional): Financial time series
285
- - **Health** (traditional): Medical and health data
286
- - **Image** (sequential): Image-based time series
287
- - **Public Info** (traditional): Public information datasets
288
- - **Sales** (traditional): Sales and pricing data
289
- - **Scientific** (sequential): Scientific simulation data
290
- - **Stock** (collections): Stock market data
291
- - **Text** (sequential): Text-based time series
292
- - **Video** (sequential): Video-based time series
293
- - **Web** (traditional): Web analytics data
294
- - **Wikipedia** (collections): Wikipedia usage data
295
- """)
296
-
297
- with gr.Tab("📈 Overall Leaderboard"):
298
  with gr.Row():
299
- domain_filter = gr.Dropdown(
300
- choices=get_domains(),
301
- value="all",
302
- label="Filter by Domain",
303
- interactive=True
304
- )
305
- category_filter = gr.Dropdown(
306
- choices=get_categories(),
307
- value="all",
308
- label="Filter by Category",
309
- interactive=True
310
- )
311
- dataset_filter = gr.Dropdown(
312
- choices=get_datasets(),
313
- value="all",
314
- label="Filter by Dataset",
315
- interactive=True
316
- )
317
 
318
- leaderboard_df = gr.Dataframe(
319
- value=update_leaderboard_overall(),
320
- headers=["Rank", "Model", "Submitter", "Submission Date", "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Task", "Domain", "Category", "Dataset", "Dataset Version"],
321
- datatype=["number", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
322
- interactive=False,
323
- label="MUSED-FM Overall Leaderboard"
324
- )
325
 
326
- refresh_btn = gr.Button("🔄 Refresh Leaderboard", variant="secondary")
327
- refresh_btn.click(
328
- fn=lambda d, c, ds: update_leaderboard_overall(d, c, ds),
329
- inputs=[domain_filter, category_filter, dataset_filter],
330
- outputs=leaderboard_df
331
- )
332
-
333
- with gr.Tab("🏢 By Domain"):
334
- gr.Markdown("### Performance by Domain")
 
335
 
336
- domain_leaderboard = gr.Dataframe(
337
- value=update_leaderboard_by_domain(),
338
- headers=["Rank", "Model", "Submitter", "Submission Date", "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Task", "Category", "Dataset", "Dataset Version"],
339
- datatype=["number", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
340
- interactive=False,
341
- label="Domain-Specific Leaderboard"
342
- )
343
 
344
- domain_refresh_btn = gr.Button("🔄 Refresh Domain Leaderboard", variant="secondary")
345
- domain_refresh_btn.click(fn=update_leaderboard_by_domain, outputs=domain_leaderboard)
346
-
347
- with gr.Tab("📂 By Category"):
348
- gr.Markdown("### Performance by Category")
349
 
350
- category_leaderboard = gr.Dataframe(
351
- value=update_leaderboard_by_category(),
352
- headers=["Rank", "Model", "Submitter", "Submission Date", "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Task", "Domain", "Dataset", "Dataset Version"],
353
- datatype=["number", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
354
- interactive=False,
355
- label="Category-Specific Leaderboard"
356
  )
357
 
358
- category_refresh_btn = gr.Button("🔄 Refresh Category Leaderboard", variant="secondary")
359
- category_refresh_btn.click(fn=update_leaderboard_by_category, outputs=category_leaderboard)
360
-
361
- with gr.Tab("📊 By Dataset"):
362
- gr.Markdown("### Performance by Dataset")
363
-
364
- dataset_leaderboard = gr.Dataframe(
365
- value=update_leaderboard_by_dataset(),
366
- headers=["Rank", "Model", "Submitter", "Submission Date", "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Task", "Domain", "Category", "Dataset Version"],
367
- datatype=["number", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
368
- interactive=False,
369
- label="Dataset-Specific Leaderboard"
370
  )
371
 
372
- dataset_refresh_btn = gr.Button("🔄 Refresh Dataset Leaderboard", variant="secondary")
373
- dataset_refresh_btn.click(fn=update_leaderboard_by_dataset, outputs=dataset_leaderboard)
374
-
375
- with gr.Tab("📝 Submit Model"):
376
- gr.Markdown("### Submit Your Model Results")
377
-
378
- with gr.Row():
379
- with gr.Column():
380
- model_name = gr.Textbox(label="Model Name", placeholder="e.g., MyTimeSeriesModel")
381
- submitter_name = gr.Textbox(label="Submitter Name", placeholder="Your name or organization")
382
-
383
- gr.Markdown("### Performance Metrics")
384
- mae = gr.Number(label="MAE (Mean Absolute Error)", precision=3)
385
- uni_mae = gr.Number(label="Uni-MAE (Univariate MAE)", precision=3)
386
- rmse = gr.Number(label="RMSE (Root Mean Square Error)", precision=3)
387
- mape = gr.Number(label="MAPE (Mean Absolute Percentage Error)", precision=1)
388
- r2 = gr.Number(label="R² (Coefficient of Determination)", precision=3)
389
- smape = gr.Number(label="SMAPE (Symmetric MAPE)", precision=1)
390
- uni_multi = gr.Number(label="Uni-Multi (Univariate-Multivariate)", precision=3)
391
-
392
- with gr.Column():
393
- task = gr.Dropdown(
394
- choices=["multivariate_forecasting"],
395
- value="multivariate_forecasting",
396
- label="Task"
397
- )
398
- domain = gr.Dropdown(
399
- choices=["Causal Model", "Dynamic", "Energy", "Engineering", "Environment", "Finance", "Health", "Image", "Public Info", "Sales", "Scientific", "Stock", "Text", "Video", "Web", "Wikipedia"],
400
- value="Energy",
401
- label="Domain"
402
- )
403
- category = gr.Dropdown(
404
- choices=["synthetic", "traditional", "sequential", "collections"],
405
- value="traditional",
406
- label="Category"
407
- )
408
- dataset = gr.Textbox(label="Dataset Name", placeholder="e.g., ecl, fred_md1, large_convlag_synin_s")
409
- dataset_version = gr.Textbox(label="Dataset Version", value="v1.0")
410
- paper_url = gr.Textbox(label="Paper URL (optional)", placeholder="https://arxiv.org/abs/...")
411
- code_url = gr.Textbox(label="Code URL (optional)", placeholder="https://github.com/...")
412
-
413
- submit_btn = gr.Button("🚀 Submit Model", variant="primary")
414
- submission_status = gr.Textbox(label="Submission Status", interactive=False)
415
-
416
- submit_btn.click(
417
- fn=submit_model,
418
- inputs=[model_name, submitter_name, mae, uni_mae, rmse, mape, r2, smape, uni_multi, task, domain, category, dataset, dataset_version, paper_url, code_url],
419
- outputs=submission_status
420
  )
421
-
422
- with gr.Tab("📦 Bulk Submit"):
423
- gr.Markdown("### Bulk Submit Results for Multiple Domain/Dataset Combinations")
424
- gr.Markdown("""
425
- **Format**: Submit a JSON array of results. Each entry should contain:
426
- ```json
427
- [
428
- {
429
- "domain": "Energy",
430
- "category": "traditional",
431
- "dataset": "ecl",
432
- "dataset_version": "v1.0",
433
- "metrics": {
434
- "MAE": 10.0,
435
- "Uni-MAE": 20.0,
436
- "RMSE": 10.0,
437
- "MAPE": 10.0,
438
- "R²": 10.0,
439
- "SMAPE": 10.0,
440
- "Uni-Multi": 10.0
441
- }
442
- }
443
- ]
444
- ```
445
- """)
446
 
447
- bulk_model_name = gr.Textbox(label="Model Name", placeholder="e.g., MyTimeSeriesModel")
448
- bulk_submitter_name = gr.Textbox(label="Submitter Name", placeholder="Your name or organization")
449
- bulk_results_data = gr.Textbox(
450
- label="Bulk Results Data (JSON)",
451
- placeholder="Paste your JSON array here...",
452
- lines=10
453
  )
454
- bulk_paper_url = gr.Textbox(label="Paper URL (optional)", placeholder="https://arxiv.org/abs/...")
455
- bulk_code_url = gr.Textbox(label="Code URL (optional)", placeholder="https://github.com/...")
456
-
457
- bulk_submit_btn = gr.Button("📦 Submit Bulk Results", variant="primary")
458
- bulk_submission_status = gr.Textbox(label="Bulk Submission Status", interactive=False)
459
 
460
- bulk_submit_btn.click(
461
- fn=submit_bulk_results,
462
- inputs=[bulk_model_name, bulk_submitter_name, bulk_results_data, bulk_paper_url, bulk_code_url],
463
- outputs=bulk_submission_status
464
  )
465
-
466
- with gr.Tab("📋 Dataset Info"):
467
- gr.Markdown("""
468
- ## MUSED-FM Dataset Information
469
-
470
- ### Overview
471
- MUSED-FM is a comprehensive multivariate time series dataset designed for forecasting tasks. The dataset contains multiple time series with various characteristics and complexities.
472
-
473
- ### Dataset Characteristics
474
- - **Type**: Multivariate Time Series
475
- - **Domain**: General forecasting tasks
476
- - **Features**: Multiple variables per time series
477
- - **Temporal Resolution**: Various (hourly, daily, etc.)
478
-
479
- ### Evaluation Protocol
480
- 1. Models are evaluated on held-out test sets
481
- 2. Standard train/validation/test splits are provided
482
- 3. Multiple evaluation metrics are used for comprehensive assessment
483
- 4. Results should be reproducible and include proper citations
484
 
485
- ### Submission Guidelines
486
- - Provide accurate performance metrics
487
- - Include links to papers and code when available
488
- - Ensure reproducibility of results
489
- - Follow ethical AI practices
490
 
491
- ### Contact
492
- For questions about the dataset or leaderboard, please contact the maintainers.
493
- """)
 
 
 
494
 
495
- with gr.Tab("📊 Statistics"):
496
- gr.Markdown("### Leaderboard Statistics")
497
-
498
- def get_stats():
499
- results = load_results()
500
- if not results:
501
- return "No submissions yet."
502
-
503
- total_models = len(results)
504
- avg_mae = np.mean([r["metrics"]["MAE"] for r in results])
505
- avg_rmse = np.mean([r["metrics"]["RMSE"] for r in results])
506
- avg_r2 = np.mean([r["metrics"]["R²"] for r in results])
507
-
508
- best_mae = min([r["metrics"]["MAE"] for r in results])
509
- best_r2 = max([r["metrics"]["R²"] for r in results])
510
-
511
- stats_text = f"""
512
- **Total Submissions**: {total_models}
513
-
514
- **Average Performance**:
515
- - MAE: {avg_mae:.3f}
516
- - RMSE: {avg_rmse:.3f}
517
- - R²: {avg_r2:.3f}
518
-
519
- **Best Performance**:
520
- - Best MAE: {best_mae:.3f}
521
- - Best R²: {best_r2:.3f}
522
- """
523
- return stats_text
524
-
525
- stats_display = gr.Markdown(value=get_stats())
526
- refresh_stats_btn = gr.Button("🔄 Refresh Statistics")
527
- refresh_stats_btn.click(fn=get_stats, outputs=stats_display)
528
 
 
529
  if __name__ == "__main__":
530
- demo.launch()
 
 
1
+ """
2
+ MUSED-FM Leaderboard - Main Gradio Application
3
+ Following GIFT-Eval import structure with custom layout
4
+ """
5
+
6
  import gradio as gr
7
  import pandas as pd
 
 
 
 
 
8
 
9
+ # Optional imports for production features
10
+ try:
11
+ from apscheduler.schedulers.background import BackgroundScheduler
12
+ SCHEDULER_AVAILABLE = True
13
+ except ImportError:
14
+ SCHEDULER_AVAILABLE = False
15
+ print("Warning: apscheduler not available, scheduler features disabled")
16
 
17
+ try:
18
+ from huggingface_hub import snapshot_download
19
+ HUB_AVAILABLE = True
20
+ except ImportError:
21
+ HUB_AVAILABLE = False
22
+ print("Warning: huggingface_hub not available, hub features disabled")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ from src.about import (
25
+ CITATION_BUTTON_LABEL,
26
+ CITATION_BUTTON_TEXT,
27
+ EVALUATION_QUEUE_TEXT,
28
+ INTRODUCTION_TEXT,
29
+ LLM_BENCHMARKS_TEXT,
30
+ TITLE,
31
+ )
32
+ from src.display.css_html_js import custom_css
33
+ from src.display.utils import (
34
+ BENCHMARK_COLS,
35
+ EVAL_COLS,
36
+ EVAL_TYPES,
37
+ ModelInfoColumn,
38
+ ModelType,
39
+ fields,
40
+ WeightType,
41
+ Precision
42
+ )
43
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
44
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_info_df, get_merged_df
45
+ from src.utils import norm_sNavie, pivot_df, get_grouped_dfs, pivot_existed_df, rename_metrics, format_df
46
+ from src.load_results import (
47
+ load_results_with_metadata,
48
+ create_overall_table,
49
+ get_filter_options,
50
+ get_model_metadata,
51
+ create_model_metadata_display,
52
+ get_overall_summary
53
+ )
54
 
55
+ def restart_space():
56
+ API.restart_space(repo_id=REPO_ID)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
+ def create_leaderboard_interface():
59
+ """Create the main leaderboard interface"""
60
+ demo = gr.Blocks(css=custom_css)
61
+ with demo:
62
+ gr.HTML(TITLE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
+ # Minimizable description section
65
+ with gr.Accordion("📖 Description", open=False):
66
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
67
 
68
+ # Get filter options
69
+ filter_options = get_filter_options()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ # Main content area
72
+ with gr.Row():
73
+ with gr.Column(scale=1):
74
+ # Individual minimizable filter sections
75
+ with gr.Accordion("🔍 Model Search", open=False):
76
+ model_search = gr.Textbox(
77
+ label="Model Search",
78
+ placeholder="Search for a specific model...",
79
+ info="Type part of a model name to filter"
80
+ )
81
+
82
+ with gr.Accordion("📂 Category Filter", open=False):
83
+ category_radio = gr.Radio(
84
+ choices=filter_options["categories"],
85
+ value="all",
86
+ label="Category",
87
+ info="Filter by category"
88
+ )
89
+
90
+ with gr.Accordion("🌐 Domain Filter", open=False):
91
+ domain_radio = gr.Radio(
92
+ choices=filter_options["domains"],
93
+ value="all",
94
+ label="Domain",
95
+ info="Filter by domain"
96
+ )
97
+
98
+ with gr.Accordion("📊 Dataset Filter", open=False):
99
+ dataset_radio = gr.Radio(
100
+ choices=filter_options["datasets"],
101
+ value="all",
102
+ label="Dataset",
103
+ info="Filter by dataset"
104
+ )
105
+
106
+ clear_filters_btn = gr.Button("🗑️ Clear All Filters", variant="secondary")
 
107
 
108
+ with gr.Column(scale=3):
109
+ gr.Markdown("### 📋 Model Rankings")
110
+
111
+ # Main results table
112
+ results_table = gr.Dataframe(
113
+ value=create_overall_table(),
114
+ headers=["Rank", "Model", "Organization", "Datasets", "Domains", "Categories",
115
+ "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi", "Submission Date"],
116
+ datatype=["number", "str", "str", "number", "number", "number",
117
+ "str", "str", "str", "str", "str", "str", "str", "str"],
118
+ interactive=False,
119
+ label="Overall Rankings",
120
+ wrap=True,
121
+ elem_classes=["elegant-table"]
122
+ )
123
+
124
+ refresh_btn = gr.Button("🔄 Refresh Table", variant="primary")
125
+
126
+ # Model metadata section at bottom
127
+ with gr.Accordion("🔍 Model Inspector", open=False):
128
+ with gr.Row():
129
+ with gr.Column(scale=1):
130
+ model_selector = gr.Dropdown(
131
+ choices=filter_options["models"],
132
+ value=None,
133
+ label="Select Model",
134
+ info="Choose a model to view its metadata",
135
+ allow_custom_value=False
136
+ )
137
+
138
+ with gr.Column(scale=3):
139
+ metadata_display = gr.Markdown(
140
+ value="Select a model to view its metadata.",
141
+ label="Model Metadata"
142
+ )
143
 
144
+ # Summary statistics section
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  with gr.Row():
146
+ with gr.Column():
147
+ gr.Markdown("### 📈 Summary Statistics")
148
+ summary_text = gr.Markdown(value=get_overall_summary())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
+ # About section
151
+ with gr.Tabs():
152
+ with gr.Tab("📖 About"):
153
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
 
154
 
155
+ # Citation section
156
+ with gr.Row():
157
+ with gr.Accordion("📙 Citation", open=False):
158
+ citation_button = gr.Textbox(
159
+ value=CITATION_BUTTON_TEXT,
160
+ label=CITATION_BUTTON_LABEL,
161
+ lines=20,
162
+ elem_id="citation-button",
163
+ show_copy_button=True,
164
+ )
165
 
166
+ # Event handlers
167
+ def update_table(domain, category, dataset, model):
168
+ return create_overall_table(domain, category, dataset, model)
 
 
 
 
169
 
170
+ def clear_filters():
171
+ return "all", "all", "all", ""
 
 
 
172
 
173
+ # Connect filters to table updates
174
+ domain_radio.change(
175
+ fn=update_table,
176
+ inputs=[domain_radio, category_radio, dataset_radio, model_search],
177
+ outputs=results_table
 
178
  )
179
 
180
+ category_radio.change(
181
+ fn=update_table,
182
+ inputs=[domain_radio, category_radio, dataset_radio, model_search],
183
+ outputs=results_table
 
 
 
 
 
 
 
 
184
  )
185
 
186
+ dataset_radio.change(
187
+ fn=update_table,
188
+ inputs=[domain_radio, category_radio, dataset_radio, model_search],
189
+ outputs=results_table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
+ model_search.change(
193
+ fn=update_table,
194
+ inputs=[domain_radio, category_radio, dataset_radio, model_search],
195
+ outputs=results_table
 
 
196
  )
 
 
 
 
 
197
 
198
+ refresh_btn.click(
199
+ fn=update_table,
200
+ inputs=[domain_radio, category_radio, dataset_radio, model_search],
201
+ outputs=results_table
202
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
+ clear_filters_btn.click(
205
+ fn=clear_filters,
206
+ outputs=[domain_radio, category_radio, dataset_radio, model_search]
207
+ )
 
208
 
209
+ # Model selector event handler
210
+ model_selector.change(
211
+ fn=create_model_metadata_display,
212
+ inputs=[model_selector],
213
+ outputs=[metadata_display]
214
+ )
215
 
216
+ return demo
217
+
218
+ # Start scheduler if available
219
+ if SCHEDULER_AVAILABLE:
220
+ scheduler = BackgroundScheduler()
221
+ scheduler.start()
222
+ else:
223
+ scheduler = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
+ # Launch the demo
226
  if __name__ == "__main__":
227
+ demo = create_leaderboard_interface()
228
+ demo.queue(default_concurrency_limit=40).launch()
demo.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MUSED-FM Leaderboard - Local Demo
3
+ Imports from app.py to ensure identical functionality, loads a local demo leaderboard
4
+ """
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import json
9
+ import os
10
+ from datetime import datetime
11
+ from typing import Dict, List, Any
12
+ import numpy as np
13
+
14
+ # Import from our src package
15
+ from src.load_results import (
16
+ load_results_with_metadata,
17
+ create_overall_table,
18
+ get_filter_options,
19
+ get_model_metadata,
20
+ create_model_metadata_display,
21
+ get_overall_summary
22
+ )
23
+
24
+ # Import the main interface function from app.py
25
+ from app import create_leaderboard_interface
26
+
27
+ # Create the demo using the same function as app.py
28
+ demo = create_leaderboard_interface()
29
+
30
+ # Launch the demo
31
+ if __name__ == "__main__":
32
+ print("🎨 MUSED-FM Leaderboard Local Demo")
33
+ print("=" * 50)
34
+
35
+ try:
36
+ print("📊 Loading data...")
37
+ results = load_results_with_metadata()
38
+ print(f"✅ Loaded {len(results)} results")
39
+
40
+ print("🏗️ Creating interface...")
41
+ print("🚀 Starting local leaderboard...")
42
+ print("📊 Access at: http://localhost:7860")
43
+ print("🔄 Press Ctrl+C to stop")
44
+
45
+ demo.launch(
46
+ server_name="0.0.0.0",
47
+ server_port=7860,
48
+ share=False,
49
+ show_error=True,
50
+ quiet=False
51
+ )
52
+
53
+ except Exception as e:
54
+ print(f"❌ Error: {e}")
55
+ import traceback
56
+ traceback.print_exc()
requirements.txt CHANGED
@@ -1,4 +1,8 @@
1
  gradio==5.49.0
 
2
  pandas>=1.5.0
3
  numpy>=1.21.0
 
 
 
4
  json5>=0.9.0
 
1
  gradio==5.49.0
2
+ gradio-leaderboard
3
  pandas>=1.5.0
4
  numpy>=1.21.0
5
+ plotly
6
+ apscheduler
7
+ huggingface-hub
8
  json5>=0.9.0
requirements_local.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=4.0.0
2
+ pandas>=1.5.0
3
+ numpy>=1.21.0
results/sample_submission/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "EXAMPLE",
3
+ "submitter": "Research Team",
4
+ "submission_date": "2025-10-09",
5
+ "task": "multivariate_forecasting",
6
+ "dataset_version": "v1.0",
7
+ "paper_url": "https://example.com/paper1",
8
+ "code_url": "https://github.com/example/repo1"
9
+ }
sample_bulk_submission.json → results/sample_submission/sample_bulk_submission.json RENAMED
File without changes
results/sample_submission2/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "EXAMPLE2",
3
+ "submitter": "Research Team",
4
+ "submission_date": "2025-10-09",
5
+ "task": "multivariate_forecasting",
6
+ "dataset_version": "v1.0",
7
+ "paper_url": "https://example.com/paper2",
8
+ "code_url": "https://github.com/example/repo2"
9
+ }
results/sample_submission2/results.json ADDED
@@ -0,0 +1,1292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "domain": "Causal Model",
4
+ "category": "synthetic",
5
+ "dataset": "large_convlag_synin_s",
6
+ "dataset_version": "v1.0",
7
+ "metrics": {
8
+ "MAE": 15.0,
9
+ "Uni-MAE": 25.0,
10
+ "RMSE": 15.0,
11
+ "MAPE": 15.0,
12
+ "R\u00b2": 15.0,
13
+ "SMAPE": 15.0,
14
+ "Uni-Multi": 15.0
15
+ }
16
+ },
17
+ {
18
+ "domain": "Causal Model",
19
+ "category": "synthetic",
20
+ "dataset": "medium_convlag_synin_s",
21
+ "dataset_version": "v1.0",
22
+ "metrics": {
23
+ "MAE": 15.0,
24
+ "Uni-MAE": 25.0,
25
+ "RMSE": 15.0,
26
+ "MAPE": 15.0,
27
+ "R\u00b2": 15.0,
28
+ "SMAPE": 15.0,
29
+ "Uni-Multi": 15.0
30
+ }
31
+ },
32
+ {
33
+ "domain": "Causal Model",
34
+ "category": "synthetic",
35
+ "dataset": "medium_obslag_synin_s",
36
+ "dataset_version": "v1.0",
37
+ "metrics": {
38
+ "MAE": 15.0,
39
+ "Uni-MAE": 25.0,
40
+ "RMSE": 15.0,
41
+ "MAPE": 15.0,
42
+ "R\u00b2": 15.0,
43
+ "SMAPE": 15.0,
44
+ "Uni-Multi": 15.0
45
+ }
46
+ },
47
+ {
48
+ "domain": "Causal Model",
49
+ "category": "synthetic",
50
+ "dataset": "tiny_convlag_synin_ns",
51
+ "dataset_version": "v1.0",
52
+ "metrics": {
53
+ "MAE": 15.0,
54
+ "Uni-MAE": 25.0,
55
+ "RMSE": 15.0,
56
+ "MAPE": 15.0,
57
+ "R\u00b2": 15.0,
58
+ "SMAPE": 15.0,
59
+ "Uni-Multi": 15.0
60
+ }
61
+ },
62
+ {
63
+ "domain": "Causal Model",
64
+ "category": "synthetic",
65
+ "dataset": "tiny_obslag_synin_ns",
66
+ "dataset_version": "v1.0",
67
+ "metrics": {
68
+ "MAE": 15.0,
69
+ "Uni-MAE": 25.0,
70
+ "RMSE": 15.0,
71
+ "MAPE": 15.0,
72
+ "R\u00b2": 15.0,
73
+ "SMAPE": 15.0,
74
+ "Uni-Multi": 15.0
75
+ }
76
+ },
77
+ {
78
+ "domain": "Dynamic",
79
+ "category": "synthetic",
80
+ "dataset": "dynamic_data_csvs",
81
+ "dataset_version": "v1.0",
82
+ "metrics": {
83
+ "MAE": 15.0,
84
+ "Uni-MAE": 25.0,
85
+ "RMSE": 15.0,
86
+ "MAPE": 15.0,
87
+ "R\u00b2": 15.0,
88
+ "SMAPE": 15.0,
89
+ "Uni-Multi": 15.0
90
+ }
91
+ },
92
+ {
93
+ "domain": "Energy",
94
+ "category": "traditional",
95
+ "dataset": "al_daily",
96
+ "dataset_version": "v1.0",
97
+ "metrics": {
98
+ "MAE": 15.0,
99
+ "Uni-MAE": 25.0,
100
+ "RMSE": 15.0,
101
+ "MAPE": 15.0,
102
+ "R\u00b2": 15.0,
103
+ "SMAPE": 15.0,
104
+ "Uni-Multi": 15.0
105
+ }
106
+ },
107
+ {
108
+ "domain": "Energy",
109
+ "category": "traditional",
110
+ "dataset": "aus_electricity_nsw",
111
+ "dataset_version": "v1.0",
112
+ "metrics": {
113
+ "MAE": 15.0,
114
+ "Uni-MAE": 25.0,
115
+ "RMSE": 15.0,
116
+ "MAPE": 15.0,
117
+ "R\u00b2": 15.0,
118
+ "SMAPE": 15.0,
119
+ "Uni-Multi": 15.0
120
+ }
121
+ },
122
+ {
123
+ "domain": "Energy",
124
+ "category": "traditional",
125
+ "dataset": "aus_electricity_qld",
126
+ "dataset_version": "v1.0",
127
+ "metrics": {
128
+ "MAE": 15.0,
129
+ "Uni-MAE": 25.0,
130
+ "RMSE": 15.0,
131
+ "MAPE": 15.0,
132
+ "R\u00b2": 15.0,
133
+ "SMAPE": 15.0,
134
+ "Uni-Multi": 15.0
135
+ }
136
+ },
137
+ {
138
+ "domain": "Energy",
139
+ "category": "traditional",
140
+ "dataset": "az_daily",
141
+ "dataset_version": "v1.0",
142
+ "metrics": {
143
+ "MAE": 15.0,
144
+ "Uni-MAE": 25.0,
145
+ "RMSE": 15.0,
146
+ "MAPE": 15.0,
147
+ "R\u00b2": 15.0,
148
+ "SMAPE": 15.0,
149
+ "Uni-Multi": 15.0
150
+ }
151
+ },
152
+ {
153
+ "domain": "Energy",
154
+ "category": "traditional",
155
+ "dataset": "az_electricity",
156
+ "dataset_version": "v1.0",
157
+ "metrics": {
158
+ "MAE": 15.0,
159
+ "Uni-MAE": 25.0,
160
+ "RMSE": 15.0,
161
+ "MAPE": 15.0,
162
+ "R\u00b2": 15.0,
163
+ "SMAPE": 15.0,
164
+ "Uni-Multi": 15.0
165
+ }
166
+ },
167
+ {
168
+ "domain": "Energy",
169
+ "category": "traditional",
170
+ "dataset": "cal_daily",
171
+ "dataset_version": "v1.0",
172
+ "metrics": {
173
+ "MAE": 15.0,
174
+ "Uni-MAE": 25.0,
175
+ "RMSE": 15.0,
176
+ "MAPE": 15.0,
177
+ "R\u00b2": 15.0,
178
+ "SMAPE": 15.0,
179
+ "Uni-Multi": 15.0
180
+ }
181
+ },
182
+ {
183
+ "domain": "Energy",
184
+ "category": "traditional",
185
+ "dataset": "cal_electricity",
186
+ "dataset_version": "v1.0",
187
+ "metrics": {
188
+ "MAE": 15.0,
189
+ "Uni-MAE": 25.0,
190
+ "RMSE": 15.0,
191
+ "MAPE": 15.0,
192
+ "R\u00b2": 15.0,
193
+ "SMAPE": 15.0,
194
+ "Uni-Multi": 15.0
195
+ }
196
+ },
197
+ {
198
+ "domain": "Energy",
199
+ "category": "traditional",
200
+ "dataset": "car_daily",
201
+ "dataset_version": "v1.0",
202
+ "metrics": {
203
+ "MAE": 15.0,
204
+ "Uni-MAE": 25.0,
205
+ "RMSE": 15.0,
206
+ "MAPE": 15.0,
207
+ "R\u00b2": 15.0,
208
+ "SMAPE": 15.0,
209
+ "Uni-Multi": 15.0
210
+ }
211
+ },
212
+ {
213
+ "domain": "Energy",
214
+ "category": "traditional",
215
+ "dataset": "car_electricity",
216
+ "dataset_version": "v1.0",
217
+ "metrics": {
218
+ "MAE": 15.0,
219
+ "Uni-MAE": 25.0,
220
+ "RMSE": 15.0,
221
+ "MAPE": 15.0,
222
+ "R\u00b2": 15.0,
223
+ "SMAPE": 15.0,
224
+ "Uni-Multi": 15.0
225
+ }
226
+ },
227
+ {
228
+ "domain": "Energy",
229
+ "category": "traditional",
230
+ "dataset": "central_electricity",
231
+ "dataset_version": "v1.0",
232
+ "metrics": {
233
+ "MAE": 15.0,
234
+ "Uni-MAE": 25.0,
235
+ "RMSE": 15.0,
236
+ "MAPE": 15.0,
237
+ "R\u00b2": 15.0,
238
+ "SMAPE": 15.0,
239
+ "Uni-Multi": 15.0
240
+ }
241
+ },
242
+ {
243
+ "domain": "Energy",
244
+ "category": "traditional",
245
+ "dataset": "co_daily",
246
+ "dataset_version": "v1.0",
247
+ "metrics": {
248
+ "MAE": 15.0,
249
+ "Uni-MAE": 25.0,
250
+ "RMSE": 15.0,
251
+ "MAPE": 15.0,
252
+ "R\u00b2": 15.0,
253
+ "SMAPE": 15.0,
254
+ "Uni-Multi": 15.0
255
+ }
256
+ },
257
+ {
258
+ "domain": "Energy",
259
+ "category": "traditional",
260
+ "dataset": "eastern_electricity",
261
+ "dataset_version": "v1.0",
262
+ "metrics": {
263
+ "MAE": 15.0,
264
+ "Uni-MAE": 25.0,
265
+ "RMSE": 15.0,
266
+ "MAPE": 15.0,
267
+ "R\u00b2": 15.0,
268
+ "SMAPE": 15.0,
269
+ "Uni-Multi": 15.0
270
+ }
271
+ },
272
+ {
273
+ "domain": "Energy",
274
+ "category": "traditional",
275
+ "dataset": "ecl",
276
+ "dataset_version": "v1.0",
277
+ "metrics": {
278
+ "MAE": 15.0,
279
+ "Uni-MAE": 25.0,
280
+ "RMSE": 15.0,
281
+ "MAPE": 15.0,
282
+ "R\u00b2": 15.0,
283
+ "SMAPE": 15.0,
284
+ "Uni-Multi": 15.0
285
+ }
286
+ },
287
+ {
288
+ "domain": "Energy",
289
+ "category": "traditional",
290
+ "dataset": "ercot_load",
291
+ "dataset_version": "v1.0",
292
+ "metrics": {
293
+ "MAE": 15.0,
294
+ "Uni-MAE": 25.0,
295
+ "RMSE": 15.0,
296
+ "MAPE": 15.0,
297
+ "R\u00b2": 15.0,
298
+ "SMAPE": 15.0,
299
+ "Uni-Multi": 15.0
300
+ }
301
+ },
302
+ {
303
+ "domain": "Energy",
304
+ "category": "traditional",
305
+ "dataset": "fl_electricity",
306
+ "dataset_version": "v1.0",
307
+ "metrics": {
308
+ "MAE": 15.0,
309
+ "Uni-MAE": 25.0,
310
+ "RMSE": 15.0,
311
+ "MAPE": 15.0,
312
+ "R\u00b2": 15.0,
313
+ "SMAPE": 15.0,
314
+ "Uni-Multi": 15.0
315
+ }
316
+ },
317
+ {
318
+ "domain": "Energy",
319
+ "category": "traditional",
320
+ "dataset": "id_electricity",
321
+ "dataset_version": "v1.0",
322
+ "metrics": {
323
+ "MAE": 15.0,
324
+ "Uni-MAE": 25.0,
325
+ "RMSE": 15.0,
326
+ "MAPE": 15.0,
327
+ "R\u00b2": 15.0,
328
+ "SMAPE": 15.0,
329
+ "Uni-Multi": 15.0
330
+ }
331
+ },
332
+ {
333
+ "domain": "Energy",
334
+ "category": "traditional",
335
+ "dataset": "mds_microgrid",
336
+ "dataset_version": "v1.0",
337
+ "metrics": {
338
+ "MAE": 15.0,
339
+ "Uni-MAE": 25.0,
340
+ "RMSE": 15.0,
341
+ "MAPE": 15.0,
342
+ "R\u00b2": 15.0,
343
+ "SMAPE": 15.0,
344
+ "Uni-Multi": 15.0
345
+ }
346
+ },
347
+ {
348
+ "domain": "Energy",
349
+ "category": "traditional",
350
+ "dataset": "ne_daily",
351
+ "dataset_version": "v1.0",
352
+ "metrics": {
353
+ "MAE": 15.0,
354
+ "Uni-MAE": 25.0,
355
+ "RMSE": 15.0,
356
+ "MAPE": 15.0,
357
+ "R\u00b2": 15.0,
358
+ "SMAPE": 15.0,
359
+ "Uni-Multi": 15.0
360
+ }
361
+ },
362
+ {
363
+ "domain": "Energy",
364
+ "category": "traditional",
365
+ "dataset": "ne_electricity",
366
+ "dataset_version": "v1.0",
367
+ "metrics": {
368
+ "MAE": 15.0,
369
+ "Uni-MAE": 25.0,
370
+ "RMSE": 15.0,
371
+ "MAPE": 15.0,
372
+ "R\u00b2": 15.0,
373
+ "SMAPE": 15.0,
374
+ "Uni-Multi": 15.0
375
+ }
376
+ },
377
+ {
378
+ "domain": "Energy",
379
+ "category": "traditional",
380
+ "dataset": "nm_daily",
381
+ "dataset_version": "v1.0",
382
+ "metrics": {
383
+ "MAE": 15.0,
384
+ "Uni-MAE": 25.0,
385
+ "RMSE": 15.0,
386
+ "MAPE": 15.0,
387
+ "R\u00b2": 15.0,
388
+ "SMAPE": 15.0,
389
+ "Uni-Multi": 15.0
390
+ }
391
+ },
392
+ {
393
+ "domain": "Energy",
394
+ "category": "traditional",
395
+ "dataset": "northern_electricity",
396
+ "dataset_version": "v1.0",
397
+ "metrics": {
398
+ "MAE": 15.0,
399
+ "Uni-MAE": 25.0,
400
+ "RMSE": 15.0,
401
+ "MAPE": 15.0,
402
+ "R\u00b2": 15.0,
403
+ "SMAPE": 15.0,
404
+ "Uni-Multi": 15.0
405
+ }
406
+ },
407
+ {
408
+ "domain": "Energy",
409
+ "category": "traditional",
410
+ "dataset": "ny_daily",
411
+ "dataset_version": "v1.0",
412
+ "metrics": {
413
+ "MAE": 15.0,
414
+ "Uni-MAE": 25.0,
415
+ "RMSE": 15.0,
416
+ "MAPE": 15.0,
417
+ "R\u00b2": 15.0,
418
+ "SMAPE": 15.0,
419
+ "Uni-Multi": 15.0
420
+ }
421
+ },
422
+ {
423
+ "domain": "Energy",
424
+ "category": "traditional",
425
+ "dataset": "ny_electricity2525",
426
+ "dataset_version": "v1.0",
427
+ "metrics": {
428
+ "MAE": 15.0,
429
+ "Uni-MAE": 25.0,
430
+ "RMSE": 15.0,
431
+ "MAPE": 15.0,
432
+ "R\u00b2": 15.0,
433
+ "SMAPE": 15.0,
434
+ "Uni-Multi": 15.0
435
+ }
436
+ },
437
+ {
438
+ "domain": "Energy",
439
+ "category": "traditional",
440
+ "dataset": "or_electricity",
441
+ "dataset_version": "v1.0",
442
+ "metrics": {
443
+ "MAE": 15.0,
444
+ "Uni-MAE": 25.0,
445
+ "RMSE": 15.0,
446
+ "MAPE": 15.0,
447
+ "R\u00b2": 15.0,
448
+ "SMAPE": 15.0,
449
+ "Uni-Multi": 15.0
450
+ }
451
+ },
452
+ {
453
+ "domain": "Energy",
454
+ "category": "traditional",
455
+ "dataset": "pa_daily",
456
+ "dataset_version": "v1.0",
457
+ "metrics": {
458
+ "MAE": 15.0,
459
+ "Uni-MAE": 25.0,
460
+ "RMSE": 15.0,
461
+ "MAPE": 15.0,
462
+ "R\u00b2": 15.0,
463
+ "SMAPE": 15.0,
464
+ "Uni-Multi": 15.0
465
+ }
466
+ },
467
+ {
468
+ "domain": "Energy",
469
+ "category": "traditional",
470
+ "dataset": "pa_electricity",
471
+ "dataset_version": "v1.0",
472
+ "metrics": {
473
+ "MAE": 15.0,
474
+ "Uni-MAE": 25.0,
475
+ "RMSE": 15.0,
476
+ "MAPE": 15.0,
477
+ "R\u00b2": 15.0,
478
+ "SMAPE": 15.0,
479
+ "Uni-Multi": 15.0
480
+ }
481
+ },
482
+ {
483
+ "domain": "Energy",
484
+ "category": "traditional",
485
+ "dataset": "se_electricity",
486
+ "dataset_version": "v1.0",
487
+ "metrics": {
488
+ "MAE": 15.0,
489
+ "Uni-MAE": 25.0,
490
+ "RMSE": 15.0,
491
+ "MAPE": 15.0,
492
+ "R\u00b2": 15.0,
493
+ "SMAPE": 15.0,
494
+ "Uni-Multi": 15.0
495
+ }
496
+ },
497
+ {
498
+ "domain": "Energy",
499
+ "category": "traditional",
500
+ "dataset": "solar_alabama",
501
+ "dataset_version": "v1.0",
502
+ "metrics": {
503
+ "MAE": 15.0,
504
+ "Uni-MAE": 25.0,
505
+ "RMSE": 15.0,
506
+ "MAPE": 15.0,
507
+ "R\u00b2": 15.0,
508
+ "SMAPE": 15.0,
509
+ "Uni-Multi": 15.0
510
+ }
511
+ },
512
+ {
513
+ "domain": "Energy",
514
+ "category": "traditional",
515
+ "dataset": "southern_electricity",
516
+ "dataset_version": "v1.0",
517
+ "metrics": {
518
+ "MAE": 15.0,
519
+ "Uni-MAE": 25.0,
520
+ "RMSE": 15.0,
521
+ "MAPE": 15.0,
522
+ "R\u00b2": 15.0,
523
+ "SMAPE": 15.0,
524
+ "Uni-Multi": 15.0
525
+ }
526
+ },
527
+ {
528
+ "domain": "Energy",
529
+ "category": "traditional",
530
+ "dataset": "tn_daily",
531
+ "dataset_version": "v1.0",
532
+ "metrics": {
533
+ "MAE": 15.0,
534
+ "Uni-MAE": 25.0,
535
+ "RMSE": 15.0,
536
+ "MAPE": 15.0,
537
+ "R\u00b2": 15.0,
538
+ "SMAPE": 15.0,
539
+ "Uni-Multi": 15.0
540
+ }
541
+ },
542
+ {
543
+ "domain": "Energy",
544
+ "category": "traditional",
545
+ "dataset": "tn_electricity",
546
+ "dataset_version": "v1.0",
547
+ "metrics": {
548
+ "MAE": 15.0,
549
+ "Uni-MAE": 25.0,
550
+ "RMSE": 15.0,
551
+ "MAPE": 15.0,
552
+ "R\u00b2": 15.0,
553
+ "SMAPE": 15.0,
554
+ "Uni-Multi": 15.0
555
+ }
556
+ },
557
+ {
558
+ "domain": "Energy",
559
+ "category": "traditional",
560
+ "dataset": "tx_daily",
561
+ "dataset_version": "v1.0",
562
+ "metrics": {
563
+ "MAE": 15.0,
564
+ "Uni-MAE": 25.0,
565
+ "RMSE": 15.0,
566
+ "MAPE": 15.0,
567
+ "R\u00b2": 15.0,
568
+ "SMAPE": 15.0,
569
+ "Uni-Multi": 15.0
570
+ }
571
+ },
572
+ {
573
+ "domain": "Energy",
574
+ "category": "traditional",
575
+ "dataset": "tx_electricity",
576
+ "dataset_version": "v1.0",
577
+ "metrics": {
578
+ "MAE": 15.0,
579
+ "Uni-MAE": 25.0,
580
+ "RMSE": 15.0,
581
+ "MAPE": 15.0,
582
+ "R\u00b2": 15.0,
583
+ "SMAPE": 15.0,
584
+ "Uni-Multi": 15.0
585
+ }
586
+ },
587
+ {
588
+ "domain": "Energy",
589
+ "category": "traditional",
590
+ "dataset": "western_electricity",
591
+ "dataset_version": "v1.0",
592
+ "metrics": {
593
+ "MAE": 15.0,
594
+ "Uni-MAE": 25.0,
595
+ "RMSE": 15.0,
596
+ "MAPE": 15.0,
597
+ "R\u00b2": 15.0,
598
+ "SMAPE": 15.0,
599
+ "Uni-Multi": 15.0
600
+ }
601
+ },
602
+ {
603
+ "domain": "Engineering",
604
+ "category": "traditional",
605
+ "dataset": "ev-sensors",
606
+ "dataset_version": "v1.0",
607
+ "metrics": {
608
+ "MAE": 15.0,
609
+ "Uni-MAE": 25.0,
610
+ "RMSE": 15.0,
611
+ "MAPE": 15.0,
612
+ "R\u00b2": 15.0,
613
+ "SMAPE": 15.0,
614
+ "Uni-Multi": 15.0
615
+ }
616
+ },
617
+ {
618
+ "domain": "Engineering",
619
+ "category": "traditional",
620
+ "dataset": "voip",
621
+ "dataset_version": "v1.0",
622
+ "metrics": {
623
+ "MAE": 15.0,
624
+ "Uni-MAE": 25.0,
625
+ "RMSE": 15.0,
626
+ "MAPE": 15.0,
627
+ "R\u00b2": 15.0,
628
+ "SMAPE": 15.0,
629
+ "Uni-Multi": 15.0
630
+ }
631
+ },
632
+ {
633
+ "domain": "Environment",
634
+ "category": "traditional",
635
+ "dataset": "beijing_aq",
636
+ "dataset_version": "v1.0",
637
+ "metrics": {
638
+ "MAE": 15.0,
639
+ "Uni-MAE": 25.0,
640
+ "RMSE": 15.0,
641
+ "MAPE": 15.0,
642
+ "R\u00b2": 15.0,
643
+ "SMAPE": 15.0,
644
+ "Uni-Multi": 15.0
645
+ }
646
+ },
647
+ {
648
+ "domain": "Environment",
649
+ "category": "traditional",
650
+ "dataset": "beijing_embassy",
651
+ "dataset_version": "v1.0",
652
+ "metrics": {
653
+ "MAE": 15.0,
654
+ "Uni-MAE": 25.0,
655
+ "RMSE": 15.0,
656
+ "MAPE": 15.0,
657
+ "R\u00b2": 15.0,
658
+ "SMAPE": 15.0,
659
+ "Uni-Multi": 15.0
660
+ }
661
+ },
662
+ {
663
+ "domain": "Environment",
664
+ "category": "traditional",
665
+ "dataset": "causalrivers",
666
+ "dataset_version": "v1.0",
667
+ "metrics": {
668
+ "MAE": 15.0,
669
+ "Uni-MAE": 25.0,
670
+ "RMSE": 15.0,
671
+ "MAPE": 15.0,
672
+ "R\u00b2": 15.0,
673
+ "SMAPE": 15.0,
674
+ "Uni-Multi": 15.0
675
+ }
676
+ },
677
+ {
678
+ "domain": "Environment",
679
+ "category": "traditional",
680
+ "dataset": "gas_sensor",
681
+ "dataset_version": "v1.0",
682
+ "metrics": {
683
+ "MAE": 15.0,
684
+ "Uni-MAE": 25.0,
685
+ "RMSE": 15.0,
686
+ "MAPE": 15.0,
687
+ "R\u00b2": 15.0,
688
+ "SMAPE": 15.0,
689
+ "Uni-Multi": 15.0
690
+ }
691
+ },
692
+ {
693
+ "domain": "Environment",
694
+ "category": "traditional",
695
+ "dataset": "oikolab_weather",
696
+ "dataset_version": "v1.0",
697
+ "metrics": {
698
+ "MAE": 15.0,
699
+ "Uni-MAE": 25.0,
700
+ "RMSE": 15.0,
701
+ "MAPE": 15.0,
702
+ "R\u00b2": 15.0,
703
+ "SMAPE": 15.0,
704
+ "Uni-Multi": 15.0
705
+ }
706
+ },
707
+ {
708
+ "domain": "Environment",
709
+ "category": "traditional",
710
+ "dataset": "open_aq",
711
+ "dataset_version": "v1.0",
712
+ "metrics": {
713
+ "MAE": 15.0,
714
+ "Uni-MAE": 25.0,
715
+ "RMSE": 15.0,
716
+ "MAPE": 15.0,
717
+ "R\u00b2": 15.0,
718
+ "SMAPE": 15.0,
719
+ "Uni-Multi": 15.0
720
+ }
721
+ },
722
+ {
723
+ "domain": "Environment",
724
+ "category": "traditional",
725
+ "dataset": "weather_mpi",
726
+ "dataset_version": "v1.0",
727
+ "metrics": {
728
+ "MAE": 15.0,
729
+ "Uni-MAE": 25.0,
730
+ "RMSE": 15.0,
731
+ "MAPE": 15.0,
732
+ "R\u00b2": 15.0,
733
+ "SMAPE": 15.0,
734
+ "Uni-Multi": 15.0
735
+ }
736
+ },
737
+ {
738
+ "domain": "Finance",
739
+ "category": "traditional",
740
+ "dataset": "fred_md1",
741
+ "dataset_version": "v1.0",
742
+ "metrics": {
743
+ "MAE": 15.0,
744
+ "Uni-MAE": 25.0,
745
+ "RMSE": 15.0,
746
+ "MAPE": 15.0,
747
+ "R\u00b2": 15.0,
748
+ "SMAPE": 15.0,
749
+ "Uni-Multi": 15.0
750
+ }
751
+ },
752
+ {
753
+ "domain": "Finance",
754
+ "category": "traditional",
755
+ "dataset": "fred_md2",
756
+ "dataset_version": "v1.0",
757
+ "metrics": {
758
+ "MAE": 15.0,
759
+ "Uni-MAE": 25.0,
760
+ "RMSE": 15.0,
761
+ "MAPE": 15.0,
762
+ "R\u00b2": 15.0,
763
+ "SMAPE": 15.0,
764
+ "Uni-Multi": 15.0
765
+ }
766
+ },
767
+ {
768
+ "domain": "Finance",
769
+ "category": "traditional",
770
+ "dataset": "fred_md3",
771
+ "dataset_version": "v1.0",
772
+ "metrics": {
773
+ "MAE": 15.0,
774
+ "Uni-MAE": 25.0,
775
+ "RMSE": 15.0,
776
+ "MAPE": 15.0,
777
+ "R\u00b2": 15.0,
778
+ "SMAPE": 15.0,
779
+ "Uni-Multi": 15.0
780
+ }
781
+ },
782
+ {
783
+ "domain": "Finance",
784
+ "category": "traditional",
785
+ "dataset": "fred_md4",
786
+ "dataset_version": "v1.0",
787
+ "metrics": {
788
+ "MAE": 15.0,
789
+ "Uni-MAE": 25.0,
790
+ "RMSE": 15.0,
791
+ "MAPE": 15.0,
792
+ "R\u00b2": 15.0,
793
+ "SMAPE": 15.0,
794
+ "Uni-Multi": 15.0
795
+ }
796
+ },
797
+ {
798
+ "domain": "Finance",
799
+ "category": "traditional",
800
+ "dataset": "fred_md5",
801
+ "dataset_version": "v1.0",
802
+ "metrics": {
803
+ "MAE": 15.0,
804
+ "Uni-MAE": 25.0,
805
+ "RMSE": 15.0,
806
+ "MAPE": 15.0,
807
+ "R\u00b2": 15.0,
808
+ "SMAPE": 15.0,
809
+ "Uni-Multi": 15.0
810
+ }
811
+ },
812
+ {
813
+ "domain": "Finance",
814
+ "category": "traditional",
815
+ "dataset": "fred_md6",
816
+ "dataset_version": "v1.0",
817
+ "metrics": {
818
+ "MAE": 15.0,
819
+ "Uni-MAE": 25.0,
820
+ "RMSE": 15.0,
821
+ "MAPE": 15.0,
822
+ "R\u00b2": 15.0,
823
+ "SMAPE": 15.0,
824
+ "Uni-Multi": 15.0
825
+ }
826
+ },
827
+ {
828
+ "domain": "Finance",
829
+ "category": "traditional",
830
+ "dataset": "fred_md7",
831
+ "dataset_version": "v1.0",
832
+ "metrics": {
833
+ "MAE": 15.0,
834
+ "Uni-MAE": 25.0,
835
+ "RMSE": 15.0,
836
+ "MAPE": 15.0,
837
+ "R\u00b2": 15.0,
838
+ "SMAPE": 15.0,
839
+ "Uni-Multi": 15.0
840
+ }
841
+ },
842
+ {
843
+ "domain": "Finance",
844
+ "category": "traditional",
845
+ "dataset": "fred_md8",
846
+ "dataset_version": "v1.0",
847
+ "metrics": {
848
+ "MAE": 15.0,
849
+ "Uni-MAE": 25.0,
850
+ "RMSE": 15.0,
851
+ "MAPE": 15.0,
852
+ "R\u00b2": 15.0,
853
+ "SMAPE": 15.0,
854
+ "Uni-Multi": 15.0
855
+ }
856
+ },
857
+ {
858
+ "domain": "Health",
859
+ "category": "traditional",
860
+ "dataset": "cgm",
861
+ "dataset_version": "v1.0",
862
+ "metrics": {
863
+ "MAE": 15.0,
864
+ "Uni-MAE": 25.0,
865
+ "RMSE": 15.0,
866
+ "MAPE": 15.0,
867
+ "R\u00b2": 15.0,
868
+ "SMAPE": 15.0,
869
+ "Uni-Multi": 15.0
870
+ }
871
+ },
872
+ {
873
+ "domain": "Health",
874
+ "category": "traditional",
875
+ "dataset": "sleep_lab",
876
+ "dataset_version": "v1.0",
877
+ "metrics": {
878
+ "MAE": 15.0,
879
+ "Uni-MAE": 25.0,
880
+ "RMSE": 15.0,
881
+ "MAPE": 15.0,
882
+ "R\u00b2": 15.0,
883
+ "SMAPE": 15.0,
884
+ "Uni-Multi": 15.0
885
+ }
886
+ },
887
+ {
888
+ "domain": "Image",
889
+ "category": "sequential",
890
+ "dataset": "cifar150_timeseries_csvs",
891
+ "dataset_version": "v1.0",
892
+ "metrics": {
893
+ "MAE": 15.0,
894
+ "Uni-MAE": 25.0,
895
+ "RMSE": 15.0,
896
+ "MAPE": 15.0,
897
+ "R\u00b2": 15.0,
898
+ "SMAPE": 15.0,
899
+ "Uni-Multi": 15.0
900
+ }
901
+ },
902
+ {
903
+ "domain": "Public Info",
904
+ "category": "traditional",
905
+ "dataset": "austin_water",
906
+ "dataset_version": "v1.0",
907
+ "metrics": {
908
+ "MAE": 15.0,
909
+ "Uni-MAE": 25.0,
910
+ "RMSE": 15.0,
911
+ "MAPE": 15.0,
912
+ "R\u00b2": 15.0,
913
+ "SMAPE": 15.0,
914
+ "Uni-Multi": 15.0
915
+ }
916
+ },
917
+ {
918
+ "domain": "Public Info",
919
+ "category": "traditional",
920
+ "dataset": "blue_bikes",
921
+ "dataset_version": "v1.0",
922
+ "metrics": {
923
+ "MAE": 15.0,
924
+ "Uni-MAE": 25.0,
925
+ "RMSE": 15.0,
926
+ "MAPE": 15.0,
927
+ "R\u00b2": 15.0,
928
+ "SMAPE": 15.0,
929
+ "Uni-Multi": 15.0
930
+ }
931
+ },
932
+ {
933
+ "domain": "Public Info",
934
+ "category": "traditional",
935
+ "dataset": "cursor-tabs",
936
+ "dataset_version": "v1.0",
937
+ "metrics": {
938
+ "MAE": 15.0,
939
+ "Uni-MAE": 25.0,
940
+ "RMSE": 15.0,
941
+ "MAPE": 15.0,
942
+ "R\u00b2": 15.0,
943
+ "SMAPE": 15.0,
944
+ "Uni-Multi": 15.0
945
+ }
946
+ },
947
+ {
948
+ "domain": "Public Info",
949
+ "category": "traditional",
950
+ "dataset": "mn_interstate",
951
+ "dataset_version": "v1.0",
952
+ "metrics": {
953
+ "MAE": 15.0,
954
+ "Uni-MAE": 25.0,
955
+ "RMSE": 15.0,
956
+ "MAPE": 15.0,
957
+ "R\u00b2": 15.0,
958
+ "SMAPE": 15.0,
959
+ "Uni-Multi": 15.0
960
+ }
961
+ },
962
+ {
963
+ "domain": "Public Info",
964
+ "category": "traditional",
965
+ "dataset": "mta_ridership",
966
+ "dataset_version": "v1.0",
967
+ "metrics": {
968
+ "MAE": 15.0,
969
+ "Uni-MAE": 25.0,
970
+ "RMSE": 15.0,
971
+ "MAPE": 15.0,
972
+ "R\u00b2": 15.0,
973
+ "SMAPE": 15.0,
974
+ "Uni-Multi": 15.0
975
+ }
976
+ },
977
+ {
978
+ "domain": "Public Info",
979
+ "category": "traditional",
980
+ "dataset": "paris_mobility",
981
+ "dataset_version": "v1.0",
982
+ "metrics": {
983
+ "MAE": 15.0,
984
+ "Uni-MAE": 25.0,
985
+ "RMSE": 15.0,
986
+ "MAPE": 15.0,
987
+ "R\u00b2": 15.0,
988
+ "SMAPE": 15.0,
989
+ "Uni-Multi": 15.0
990
+ }
991
+ },
992
+ {
993
+ "domain": "Public Info",
994
+ "category": "traditional",
995
+ "dataset": "lyft",
996
+ "dataset_version": "v1.0",
997
+ "metrics": {
998
+ "MAE": 15.0,
999
+ "Uni-MAE": 25.0,
1000
+ "RMSE": 15.0,
1001
+ "MAPE": 15.0,
1002
+ "R\u00b2": 15.0,
1003
+ "SMAPE": 15.0,
1004
+ "Uni-Multi": 15.0
1005
+ }
1006
+ },
1007
+ {
1008
+ "domain": "Public Info",
1009
+ "category": "traditional",
1010
+ "dataset": "uber",
1011
+ "dataset_version": "v1.0",
1012
+ "metrics": {
1013
+ "MAE": 15.0,
1014
+ "Uni-MAE": 25.0,
1015
+ "RMSE": 15.0,
1016
+ "MAPE": 15.0,
1017
+ "R\u00b2": 15.0,
1018
+ "SMAPE": 15.0,
1019
+ "Uni-Multi": 15.0
1020
+ }
1021
+ },
1022
+ {
1023
+ "domain": "Public Info",
1024
+ "category": "traditional",
1025
+ "dataset": "tac",
1026
+ "dataset_version": "v1.0",
1027
+ "metrics": {
1028
+ "MAE": 15.0,
1029
+ "Uni-MAE": 25.0,
1030
+ "RMSE": 15.0,
1031
+ "MAPE": 15.0,
1032
+ "R\u00b2": 15.0,
1033
+ "SMAPE": 15.0,
1034
+ "Uni-Multi": 15.0
1035
+ }
1036
+ },
1037
+ {
1038
+ "domain": "Public Info",
1039
+ "category": "traditional",
1040
+ "dataset": "traffic_PeMS",
1041
+ "dataset_version": "v1.0",
1042
+ "metrics": {
1043
+ "MAE": 15.0,
1044
+ "Uni-MAE": 25.0,
1045
+ "RMSE": 15.0,
1046
+ "MAPE": 15.0,
1047
+ "R\u00b2": 15.0,
1048
+ "SMAPE": 15.0,
1049
+ "Uni-Multi": 15.0
1050
+ }
1051
+ },
1052
+ {
1053
+ "domain": "Sales",
1054
+ "category": "traditional",
1055
+ "dataset": "bitcoin_price",
1056
+ "dataset_version": "v1.0",
1057
+ "metrics": {
1058
+ "MAE": 15.0,
1059
+ "Uni-MAE": 25.0,
1060
+ "RMSE": 15.0,
1061
+ "MAPE": 15.0,
1062
+ "R\u00b2": 15.0,
1063
+ "SMAPE": 15.0,
1064
+ "Uni-Multi": 15.0
1065
+ }
1066
+ },
1067
+ {
1068
+ "domain": "Sales",
1069
+ "category": "traditional",
1070
+ "dataset": "blow_molding",
1071
+ "dataset_version": "v1.0",
1072
+ "metrics": {
1073
+ "MAE": 15.0,
1074
+ "Uni-MAE": 25.0,
1075
+ "RMSE": 15.0,
1076
+ "MAPE": 15.0,
1077
+ "R\u00b2": 15.0,
1078
+ "SMAPE": 15.0,
1079
+ "Uni-Multi": 15.0
1080
+ }
1081
+ },
1082
+ {
1083
+ "domain": "Sales",
1084
+ "category": "traditional",
1085
+ "dataset": "gold_prices",
1086
+ "dataset_version": "v1.0",
1087
+ "metrics": {
1088
+ "MAE": 15.0,
1089
+ "Uni-MAE": 25.0,
1090
+ "RMSE": 15.0,
1091
+ "MAPE": 15.0,
1092
+ "R\u00b2": 15.0,
1093
+ "SMAPE": 15.0,
1094
+ "Uni-Multi": 15.0
1095
+ }
1096
+ },
1097
+ {
1098
+ "domain": "Sales",
1099
+ "category": "traditional",
1100
+ "dataset": "pasta_sales",
1101
+ "dataset_version": "v1.0",
1102
+ "metrics": {
1103
+ "MAE": 15.0,
1104
+ "Uni-MAE": 25.0,
1105
+ "RMSE": 15.0,
1106
+ "MAPE": 15.0,
1107
+ "R\u00b2": 15.0,
1108
+ "SMAPE": 15.0,
1109
+ "Uni-Multi": 15.0
1110
+ }
1111
+ },
1112
+ {
1113
+ "domain": "Sales",
1114
+ "category": "traditional",
1115
+ "dataset": "rice_prices",
1116
+ "dataset_version": "v1.0",
1117
+ "metrics": {
1118
+ "MAE": 15.0,
1119
+ "Uni-MAE": 25.0,
1120
+ "RMSE": 15.0,
1121
+ "MAPE": 15.0,
1122
+ "R\u00b2": 15.0,
1123
+ "SMAPE": 15.0,
1124
+ "Uni-Multi": 15.0
1125
+ }
1126
+ },
1127
+ {
1128
+ "domain": "Sales",
1129
+ "category": "traditional",
1130
+ "dataset": "walmart-sales",
1131
+ "dataset_version": "v1.0",
1132
+ "metrics": {
1133
+ "MAE": 15.0,
1134
+ "Uni-MAE": 25.0,
1135
+ "RMSE": 15.0,
1136
+ "MAPE": 15.0,
1137
+ "R\u00b2": 15.0,
1138
+ "SMAPE": 15.0,
1139
+ "Uni-Multi": 15.0
1140
+ }
1141
+ },
1142
+ {
1143
+ "domain": "Scientific",
1144
+ "category": "sequential",
1145
+ "dataset": "ant_csv_out",
1146
+ "dataset_version": "v1.0",
1147
+ "metrics": {
1148
+ "MAE": 15.0,
1149
+ "Uni-MAE": 25.0,
1150
+ "RMSE": 15.0,
1151
+ "MAPE": 15.0,
1152
+ "R\u00b2": 15.0,
1153
+ "SMAPE": 15.0,
1154
+ "Uni-Multi": 15.0
1155
+ }
1156
+ },
1157
+ {
1158
+ "domain": "Scientific",
1159
+ "category": "sequential",
1160
+ "dataset": "hopper_csv_out",
1161
+ "dataset_version": "v1.0",
1162
+ "metrics": {
1163
+ "MAE": 15.0,
1164
+ "Uni-MAE": 25.0,
1165
+ "RMSE": 15.0,
1166
+ "MAPE": 15.0,
1167
+ "R\u00b2": 15.0,
1168
+ "SMAPE": 15.0,
1169
+ "Uni-Multi": 15.0
1170
+ }
1171
+ },
1172
+ {
1173
+ "domain": "Scientific",
1174
+ "category": "sequential",
1175
+ "dataset": "cheetah_csv_out",
1176
+ "dataset_version": "v1.0",
1177
+ "metrics": {
1178
+ "MAE": 15.0,
1179
+ "Uni-MAE": 25.0,
1180
+ "RMSE": 15.0,
1181
+ "MAPE": 15.0,
1182
+ "R\u00b2": 15.0,
1183
+ "SMAPE": 15.0,
1184
+ "Uni-Multi": 15.0
1185
+ }
1186
+ },
1187
+ {
1188
+ "domain": "Scientific",
1189
+ "category": "sequential",
1190
+ "dataset": "walker2d_csv_out",
1191
+ "dataset_version": "v1.0",
1192
+ "metrics": {
1193
+ "MAE": 15.0,
1194
+ "Uni-MAE": 25.0,
1195
+ "RMSE": 15.0,
1196
+ "MAPE": 15.0,
1197
+ "R\u00b2": 15.0,
1198
+ "SMAPE": 15.0,
1199
+ "Uni-Multi": 15.0
1200
+ }
1201
+ },
1202
+ {
1203
+ "domain": "Scientific",
1204
+ "category": "sequential",
1205
+ "dataset": "spriteworld",
1206
+ "dataset_version": "v1.0",
1207
+ "metrics": {
1208
+ "MAE": 15.0,
1209
+ "Uni-MAE": 25.0,
1210
+ "RMSE": 15.0,
1211
+ "MAPE": 15.0,
1212
+ "R\u00b2": 15.0,
1213
+ "SMAPE": 15.0,
1214
+ "Uni-Multi": 15.0
1215
+ }
1216
+ },
1217
+ {
1218
+ "domain": "Stock",
1219
+ "category": "collections",
1220
+ "dataset": "stock_nasdaqtrader",
1221
+ "dataset_version": "v1.0",
1222
+ "metrics": {
1223
+ "MAE": 15.0,
1224
+ "Uni-MAE": 25.0,
1225
+ "RMSE": 15.0,
1226
+ "MAPE": 15.0,
1227
+ "R\u00b2": 15.0,
1228
+ "SMAPE": 15.0,
1229
+ "Uni-Multi": 15.0
1230
+ }
1231
+ },
1232
+ {
1233
+ "domain": "Text",
1234
+ "category": "sequential",
1235
+ "dataset": "openwebtext_timeseries_csvs",
1236
+ "dataset_version": "v1.0",
1237
+ "metrics": {
1238
+ "MAE": 15.0,
1239
+ "Uni-MAE": 25.0,
1240
+ "RMSE": 15.0,
1241
+ "MAPE": 15.0,
1242
+ "R\u00b2": 15.0,
1243
+ "SMAPE": 15.0,
1244
+ "Uni-Multi": 15.0
1245
+ }
1246
+ },
1247
+ {
1248
+ "domain": "Video",
1249
+ "category": "sequential",
1250
+ "dataset": "KITTI",
1251
+ "dataset_version": "v1.0",
1252
+ "metrics": {
1253
+ "MAE": 15.0,
1254
+ "Uni-MAE": 25.0,
1255
+ "RMSE": 15.0,
1256
+ "MAPE": 15.0,
1257
+ "R\u00b2": 15.0,
1258
+ "SMAPE": 15.0,
1259
+ "Uni-Multi": 15.0
1260
+ }
1261
+ },
1262
+ {
1263
+ "domain": "Web",
1264
+ "category": "traditional",
1265
+ "dataset": "website_visitors",
1266
+ "dataset_version": "v1.0",
1267
+ "metrics": {
1268
+ "MAE": 15.0,
1269
+ "Uni-MAE": 25.0,
1270
+ "RMSE": 15.0,
1271
+ "MAPE": 15.0,
1272
+ "R\u00b2": 15.0,
1273
+ "SMAPE": 15.0,
1274
+ "Uni-Multi": 15.0
1275
+ }
1276
+ },
1277
+ {
1278
+ "domain": "Wikipedia",
1279
+ "category": "collections",
1280
+ "dataset": "wikipedia",
1281
+ "dataset_version": "v1.0",
1282
+ "metrics": {
1283
+ "MAE": 15.0,
1284
+ "Uni-MAE": 25.0,
1285
+ "RMSE": 15.0,
1286
+ "MAPE": 15.0,
1287
+ "R\u00b2": 15.0,
1288
+ "SMAPE": 15.0,
1289
+ "Uni-Multi": 15.0
1290
+ }
1291
+ }
1292
+ ]
src/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MUSED-FM Leaderboard source package
3
+ """
4
+
5
+ from .load_results import (
6
+ load_results_with_metadata,
7
+ create_overall_table,
8
+ get_filter_options,
9
+ get_model_metadata,
10
+ create_model_metadata_display,
11
+ get_overall_summary
12
+ )
13
+
14
+ __all__ = [
15
+ "load_results_with_metadata",
16
+ "create_overall_table",
17
+ "get_filter_options",
18
+ "get_model_metadata",
19
+ "create_model_metadata_display",
20
+ "get_overall_summary"
21
+ ]
src/about.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text constants for MUSED-FM Leaderboard
3
+ """
4
+
5
+ TITLE = """
6
+ <div style="text-align: center;">
7
+ <h1>📊 MUSED-FM Leaderboard</h1>
8
+ <p style="font-size: 18px; color: #666;">Multivariate Time Series Evaluation Dataset for Foundation Models</p>
9
+ </div>
10
+ """
11
+
12
+ INTRODUCTION_TEXT = """
13
+ Welcome to the **MUSED-FM Leaderboard**! This leaderboard provides comprehensive evaluation results for foundation models on multivariate time series forecasting tasks.
14
+
15
+ **MUSED-FM** spans 16 multivariate time series domains and introduces novel synthetic data techniques, comprising 67 billion data points and 2.6 million time series.
16
+
17
+ ### Key Features:
18
+ - **Scale**: 67 billion data points across 2.6 million time series
19
+ - **Domains**: 16 multivariate time series domains
20
+ - **Innovation**: Novel synthetic data techniques
21
+ - **Evaluation**: Comprehensive metrics including MAE, RMSE, MAPE, R², SMAPE, Uni-MAE, and Uni-Multi
22
+
23
+ ### Dataset Structure:
24
+ - **Categories**: Traditional, Sequential, Synthetic, Collections
25
+ - **Domains**: Finance, Health, Energy, Environment, Engineering, and more
26
+ - **Datasets**: 86+ individual time series datasets
27
+
28
+ Use the filters below to explore results by different criteria and compare model performance across various domains and categories.
29
+ """
30
+
31
+ LLM_BENCHMARKS_TEXT = """
32
+ # About MUSED-FM Leaderboard
33
+
34
+ ## Dataset Overview
35
+
36
+ **MUSED-FM** (Multivariate Time Series Evaluation Dataset for Foundation Models) is a comprehensive benchmark for evaluating foundation models on multivariate time series forecasting tasks.
37
+
38
+ ### Key Features:
39
+ - **Scale**: 67 billion data points across 2.6 million time series
40
+ - **Domains**: 16 multivariate time series domains
41
+ - **Innovation**: Novel synthetic data techniques
42
+ - **Evaluation**: Comprehensive metrics including MAE, RMSE, MAPE, R², SMAPE, Uni-MAE, and Uni-Multi
43
+
44
+ ### Dataset Structure:
45
+ - **Categories**: Traditional, Sequential, Synthetic, Collections
46
+ - **Domains**: Finance, Health, Energy, Environment, Engineering, and more
47
+ - **Datasets**: 86+ individual time series datasets
48
+
49
+ ## Evaluation Metrics
50
+
51
+ ### Standard Metrics:
52
+ - **MAE** (Mean Absolute Error): Average absolute difference between predicted and actual values
53
+ - **RMSE** (Root Mean Square Error): Square root of average squared differences
54
+ - **MAPE** (Mean Absolute Percentage Error): Average percentage error
55
+ - **R²** (Coefficient of Determination): Proportion of variance explained
56
+ - **SMAPE** (Symmetric Mean Absolute Percentage Error): Symmetric percentage error
57
+
58
+ ### Novel Metrics:
59
+ - **Uni-MAE**: Unified MAE metric for cross-dataset comparison
60
+ - **Uni-Multi**: Unified multivariate metric for comprehensive evaluation
61
+
62
+ ## Resources
63
+
64
+ ### Dataset Access:
65
+ - **Hugging Face**: [MUSED-FM Dataset](https://huggingface.co/datasets/Synthefy/MUSED-FM)
66
+ - **GitHub Repository**: [MUSED-FM Code](https://github.com/Synthefy/MUSED-FM)
67
+
68
+ ### Citation:
69
+ If you use MUSED-FM in your research, please cite the original paper:
70
+
71
+ ```bibtex
72
+ @article{mused-fm2024,
73
+ title={MUSED-FM: A Multivariate Time Series Evaluation Dataset for Foundation Models},
74
+ author={Synthefy Research Team},
75
+ journal={arXiv preprint},
76
+ year={2024}
77
+ }
78
+ ```
79
+
80
+ ## Contact & Support
81
+
82
+ For questions about the dataset or leaderboard:
83
+ - **Issues**: Report issues on the [GitHub repository](https://github.com/Synthefy/MUSED-FM)
84
+ - **Discussions**: Join discussions on [Hugging Face](https://huggingface.co/datasets/Synthefy/MUSED-FM)
85
+
86
+ ## Leaderboard Information
87
+
88
+ This leaderboard provides:
89
+ - **Real-time Rankings**: Live updates as new submissions are received
90
+ - **Filtered Views**: Explore results by domain, category, and dataset
91
+ - **Model Inspector**: Detailed metadata for each submitted model
92
+ - **Comprehensive Metrics**: Multiple evaluation perspectives
93
+
94
+ The leaderboard aggregates results across all datasets to provide overall model rankings while maintaining the ability to drill down into specific domains and categories.
95
+ """
96
+
97
+ CITATION_BUTTON_LABEL = "📋 Citation"
98
+ CITATION_BUTTON_TEXT = """@article{mused-fm2024,
99
+ title={MUSED-FM: A Multivariate Time Series Evaluation Dataset for Foundation Models},
100
+ author={Synthefy Research Team},
101
+ journal={arXiv preprint},
102
+ year={2024}
103
+ }"""
104
+
105
+ EVALUATION_QUEUE_TEXT = """
106
+ ## Evaluation Queue
107
+
108
+ This section shows the current status of model evaluations in the queue.
109
+ """
src/display/css_html_js.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CSS and styling for MUSED-FM Leaderboard
3
+ """
4
+
5
+ custom_css = """
6
+ /* Custom styling for MUSED-FM Leaderboard */
7
+ .elegant-table {
8
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
9
+ }
10
+
11
+ .markdown-text {
12
+ font-size: 14px;
13
+ line-height: 1.6;
14
+ }
15
+
16
+ .tab-buttons {
17
+ margin-top: 20px;
18
+ }
19
+
20
+ #citation-button {
21
+ font-family: 'Courier New', monospace;
22
+ font-size: 12px;
23
+ }
24
+ """
src/display/utils.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Display utilities and column definitions for MUSED-FM Leaderboard
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+ from typing import List, Dict, Any
7
+ from enum import Enum
8
+
9
+ # Column definitions for model information
10
@dataclass
class ModelInfoColumn:
    """Describe one column of the model-information table.

    Only ``name`` is required; the remaining fields are display flags with
    defaults.
    """

    # Column identifier.
    name: str
    # Type label for the column; the entries in this module use "str" or "number".
    type: str = "str"
    # Display flags (how they are consumed is not visible in this module).
    displayed_by_default: bool = True
    never_hidden: bool = False
    hidden: bool = False
17
+
18
+ # Model information columns
19
+ model_info_columns = [
20
+ ModelInfoColumn("model", "str", True, True, False),
21
+ ModelInfoColumn("organization", "str", True, False, False),
22
+ ModelInfoColumn("submission_date", "str", True, False, False),
23
+ ModelInfoColumn("task", "str", True, False, False),
24
+ ModelInfoColumn("dataset_version", "str", True, False, False),
25
+ ModelInfoColumn("paper_url", "str", False, False, False),
26
+ ModelInfoColumn("code_url", "str", False, False, False),
27
+ ModelInfoColumn("domains", "number", True, False, False),
28
+ ModelInfoColumn("categories", "number", True, False, False),
29
+ ModelInfoColumn("datasets", "number", True, False, False),
30
+ ]
31
+
32
+ # Benchmark columns (metrics)
33
+ BENCHMARK_COLS = [
34
+ "MAE", "Uni-MAE", "RMSE", "MAPE", "R²", "SMAPE", "Uni-Multi"
35
+ ]
36
+
37
+ # Evaluation columns
38
+ EVAL_COLS = [
39
+ "model", "submitter", "submission_date", "domain", "category", "dataset",
40
+ "task", "dataset_version", "paper_url", "code_url"
41
+ ]
42
+
43
+ # Evaluation types
44
+ EVAL_TYPES = ["multivariate_forecasting"]
45
+
46
+ # Model types
47
class ModelType(Enum):
    """Closed set of model-family labels; each value is the display string."""

    FOUNDATION = "Foundation Model"
    TRADITIONAL = "Traditional"
    NEURAL = "Neural Network"
    TRANSFORMER = "Transformer"
52
+
53
+ # Weight types
54
class WeightType(Enum):
    """Closed set of model-size labels; each value is the display string."""

    LIGHTWEIGHT = "Lightweight"
    MEDIUM = "Medium"
    HEAVY = "Heavy"
58
+
59
+ # Precision types
60
class Precision(Enum):
    """Closed set of numeric-precision labels; each value is the display string."""

    FLOAT16 = "FP16"
    FLOAT32 = "FP32"
    MIXED = "Mixed"
64
+
65
+ # Fields function for dataclass
66
def fields(cls):
    """Return the dataclass field objects of ``cls``.

    Returns the values view of ``cls.__dataclass_fields__`` when that
    attribute exists, and an empty list for anything that is not a dataclass.
    """
    if not hasattr(cls, '__dataclass_fields__'):
        return []
    return cls.__dataclass_fields__.values()
src/envs.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Environment configuration for MUSED-FM Leaderboard
3
+ """
4
+
5
+ import os
6
+
7
+ # API configuration
8
class API:
    """Namespace for space-management calls; currently a print-only stand-in."""

    @staticmethod
    def restart_space(repo_id: str):
        """Announce a restart of the space ``repo_id`` on stdout.

        No restart is performed here; the method only prints a message.
        """
        message = f"Restarting space: {repo_id}"
        print(message)
13
+
14
+ # Repository configuration
15
+ REPO_ID = "mused-fm-leaderboard"
16
+ QUEUE_REPO = "mused-fm-queue"
17
+ RESULTS_REPO = "mused-fm-results"
18
+
19
+ # Paths
20
+ EVAL_REQUESTS_PATH = "eval_requests"
21
+ EVAL_RESULTS_PATH = "results"
22
+
23
+ # Token (placeholder)
24
+ TOKEN = os.getenv("HF_TOKEN", "")
src/load_results.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data loading utilities for MUSED-FM Leaderboard
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import pandas as pd
8
+ import numpy as np
9
+ from typing import Dict, List, Any
10
+
11
+
12
def load_results_with_metadata() -> List[Dict]:
    """Load every result entry available to the leaderboard.

    Lookup order (paths are relative to the current working directory):

    1. ``results.json``: if present, its ``"results"`` list is returned as-is
       and the ``results/`` directory is not consulted.
    2. ``results/<submission>/``: each sub-directory must contain a
       ``metadata.json`` plus one other ``*.json`` file holding the result
       entries. The metadata fields overwrite the same-named fields on every
       entry of that submission.

    Returns:
        A list of result dictionaries; empty when nothing could be loaded.
    """
    # User submissions collected in a single top-level file take precedence.
    results_file = "results.json"
    if os.path.exists(results_file):
        with open(results_file, 'r', encoding="utf-8") as f:
            data = json.load(f)
        return data.get("results", [])

    all_results = []
    results_dir = "results"
    if not os.path.isdir(results_dir):
        return all_results

    # Fields that metadata.json is allowed to override on each entry.
    metadata_fields = (
        "model",
        "submitter",
        "submission_date",
        "task",
        "dataset_version",
        "paper_url",
        "code_url",
    )

    # Sorted so the load order (and the file picked below) is deterministic
    # instead of depending on the filesystem's listing order.
    for item in sorted(os.listdir(results_dir)):
        item_path = os.path.join(results_dir, item)
        if not os.path.isdir(item_path):
            continue

        metadata_path = os.path.join(item_path, "metadata.json")
        if not os.path.exists(metadata_path):
            continue

        # The results file may have any name (results.json,
        # sample_bulk_submission.json, ...); take the first non-metadata JSON.
        results_path = None
        for file in sorted(os.listdir(item_path)):
            if file.endswith('.json') and file != 'metadata.json':
                results_path = os.path.join(item_path, file)
                break
        if results_path is None:
            continue

        try:
            with open(metadata_path, 'r', encoding="utf-8") as f:
                metadata = json.load(f)
            with open(results_path, 'r', encoding="utf-8") as f:
                results_data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {item_path}: {e}")
            continue

        if not isinstance(metadata, dict):
            print(f"Error loading {item_path}: metadata.json is not a JSON object")
            continue

        # Accept the same {"results": [...]} wrapper the top-level file uses.
        if isinstance(results_data, dict):
            results_data = results_data.get("results", [])
        if not isinstance(results_data, list):
            print(f"Error loading {item_path}: results file is not a JSON list")
            continue

        # Build the submission's rows separately so a malformed submission
        # never leaves a partial set of rows in the output.
        submission_results = []
        for result in results_data:
            if not isinstance(result, dict):
                continue
            for field in metadata_fields:
                result[field] = metadata.get(field, result.get(field, ""))
            submission_results.append(result)
        all_results.extend(submission_results)

    return all_results
67
+
68
+
69
def create_overall_table(domain_filter="all", category_filter="all", dataset_filter="all", model_filter=""):
    """Build the per-model leaderboard table, optionally filtered.

    Args:
        domain_filter: Exact domain to keep, or ``"all"`` for no filtering.
        category_filter: Exact category to keep, or ``"all"``.
        dataset_filter: Exact dataset to keep, or ``"all"``.
        model_filter: Case-insensitive substring of the model name; an empty
            value disables the filter.

    Returns:
        A ``pandas.DataFrame`` with one row per model, sorted by mean MAE
        (ascending) with a 1-based ``Rank`` column. Metric columns hold
        formatted strings. An empty frame is returned when there are no
        results or nothing survives the filters.

    Raises:
        KeyError: If a kept entry lacks ``"metrics"`` or one of the required
            metrics (MAE, RMSE, MAPE, R², SMAPE).
    """
    results = load_results_with_metadata()
    if not results:
        return pd.DataFrame()

    # Apply filters
    exact_filters = (
        ("domain", domain_filter),
        ("category", category_filter),
        ("dataset", dataset_filter),
    )
    model_needle = model_filter.lower() if model_filter else ""

    filtered_results = []
    for result in results:
        if any(wanted != "all" and result.get(field, "") != wanted
               for field, wanted in exact_filters):
            continue
        # Model filter (case-insensitive partial match)
        if model_needle and model_needle not in result.get("model", "").lower():
            continue
        filtered_results.append(result)

    if not filtered_results:
        return pd.DataFrame()

    # Group by model and collect the per-dataset metric values.
    model_stats = {}
    for result in filtered_results:
        # .get keeps this consistent with the filters above: an entry without
        # descriptive fields is grouped under "" instead of raising KeyError.
        model = result.get("model", "")
        stats = model_stats.get(model)
        if stats is None:
            stats = model_stats[model] = {
                "submitter": result.get("submitter", ""),
                "submission_date": result.get("submission_date", ""),
                "mae_values": [],
                "uni_mae_values": [],
                "rmse_values": [],
                "mape_values": [],
                "r2_values": [],
                "smape_values": [],
                "uni_multi_values": [],
                "datasets": set(),
                "domains": set(),
                "categories": set(),
                "paper_url": result.get("paper_url", ""),
                "code_url": result.get("code_url", "")
            }

        metrics = result["metrics"]
        stats["mae_values"].append(metrics["MAE"])
        # NOTE(review): a missing Uni-MAE / Uni-Multi counts as 0, which pulls
        # the mean down; confirm that is the intended treatment.
        stats["uni_mae_values"].append(metrics.get("Uni-MAE", 0))
        stats["rmse_values"].append(metrics["RMSE"])
        stats["mape_values"].append(metrics["MAPE"])
        stats["r2_values"].append(metrics["R²"])
        stats["smape_values"].append(metrics["SMAPE"])
        stats["uni_multi_values"].append(metrics.get("Uni-Multi", 0))
        stats["datasets"].add(result.get("dataset", ""))
        stats["domains"].add(result.get("domain", ""))
        stats["categories"].add(result.get("category", ""))

    # Create aggregated table; keep the unrounded mean MAE next to each row so
    # ranking is not decided by the 3-decimal display string.
    ranked_rows = []
    for model, stats in model_stats.items():
        avg_mae = np.mean(stats["mae_values"])
        avg_uni_mae = np.mean(stats["uni_mae_values"])
        avg_rmse = np.mean(stats["rmse_values"])
        avg_mape = np.mean(stats["mape_values"])
        avg_r2 = np.mean(stats["r2_values"])
        avg_smape = np.mean(stats["smape_values"])
        avg_uni_multi = np.mean(stats["uni_multi_values"])

        row = {
            "Model": model,
            "Organization": stats["submitter"],
            "Datasets": len(stats["datasets"]),
            "Domains": len(stats["domains"]),
            "Categories": len(stats["categories"]),
            "MAE": f"{avg_mae:.3f}",
            "Uni-MAE": f"{avg_uni_mae:.3f}",
            "RMSE": f"{avg_rmse:.3f}",
            "MAPE": f"{avg_mape:.1f}%",
            "R²": f"{avg_r2:.3f}",
            "SMAPE": f"{avg_smape:.1f}%",
            "Uni-Multi": f"{avg_uni_multi:.3f}",
            "Submission Date": stats["submission_date"]
        }
        ranked_rows.append((float(avg_mae), row))

    # Sort by MAE (stable, so equal means keep first-seen order) and add ranks.
    ranked_rows.sort(key=lambda pair: pair[0])
    table_data = []
    for rank, (_, row) in enumerate(ranked_rows, start=1):
        row["Rank"] = rank
        table_data.append(row)

    return pd.DataFrame(table_data)
168
+
169
+
170
def get_filter_options():
    """Collect the values offered by the leaderboard's filter widgets.

    Returns:
        A dict with keys ``"domains"``, ``"categories"``, ``"datasets"`` and
        ``"models"``. Each holds the sorted distinct non-empty values found in
        the loaded results; the first three are prefixed with ``"all"``.
        Every list is empty when no results are available.
    """
    results = load_results_with_metadata()
    if not results:
        return {"domains": [], "categories": [], "datasets": [], "models": []}

    def distinct(field):
        # Sorted unique truthy values of `field` across all results.
        return sorted({entry.get(field, "") for entry in results if entry.get(field, "")})

    options = {}
    for key, field in (("domains", "domain"), ("categories", "category"), ("datasets", "dataset")):
        options[key] = ["all"] + distinct(field)
    options["models"] = distinct("model")
    return options
187
+
188
+
189
def get_model_metadata(model_name):
    """Summarise what is known about the model called ``model_name``.

    Descriptive fields (submitter, dates, URLs, ...) come from the first
    result entry of that model; coverage fields are gathered across all of
    its entries.

    Returns:
        A dict of metadata, or ``None`` when there are no results or no entry
        has exactly this model name.
    """
    results = load_results_with_metadata()
    if not results:
        return None

    entries = [entry for entry in results if entry.get("model", "") == model_name]
    if not entries:
        return None

    def coverage(field):
        # Sorted distinct non-empty values of `field` for this model.
        return sorted({entry.get(field, "") for entry in entries if entry.get(field, "")})

    first = entries[0]
    summary = {
        field: first.get(field, "")
        for field in (
            "model",
            "submitter",
            "submission_date",
            "task",
            "dataset_version",
            "paper_url",
            "code_url",
        )
    }
    summary["domains"] = coverage("domain")
    summary["categories"] = coverage("category")
    summary["datasets"] = coverage("dataset")
    summary["total_evaluations"] = len(entries)
    return summary
213
+
214
+
215
def create_model_metadata_display(selected_model):
    """Render a model's metadata as a Markdown snippet.

    Returns a prompt string when ``selected_model`` is falsy, an error line
    when no metadata exists for it, and otherwise a Markdown block with the
    descriptive fields, links, and evaluation coverage (at most five dataset
    names are listed, followed by the total count).
    """
    if not selected_model:
        return "Select a model to view its metadata."

    metadata = get_model_metadata(selected_model)
    if not metadata:
        return f"❌ No metadata found for model: {selected_model}"

    # Links fall back to a plain "not provided" line when the URL is empty.
    if metadata['paper_url']:
        paper_link = f"[📄 Paper]({metadata['paper_url']})"
    else:
        paper_link = "📄 Paper: Not provided"
    if metadata['code_url']:
        code_link = f"[💻 Code]({metadata['code_url']})"
    else:
        code_link = "💻 Code: Not provided"

    domains_text = ', '.join(metadata['domains']) if metadata['domains'] else 'None'
    categories_text = ', '.join(metadata['categories']) if metadata['categories'] else 'None'

    dataset_names = metadata['datasets']
    dataset_count = len(dataset_names)
    datasets_preview = ', '.join(dataset_names[:5])
    if dataset_count > 5:
        datasets_preview += '...'

    return f"""
## 🔍 Model Metadata: {metadata['model']}

**Organization:** {metadata['submitter']}
**Submission Date:** {metadata['submission_date']}
**Task:** {metadata['task']}
**Dataset Version:** {metadata['dataset_version']}

**Links:**
{paper_link}
{code_link}

**Evaluation Coverage:**
- **Total Evaluations:** {metadata['total_evaluations']}
- **Domains:** {domains_text}
- **Categories:** {categories_text}
- **Datasets:** {datasets_preview} ({dataset_count} total)
"""
248
+
249
+
250
def get_overall_summary():
    """Produce a Markdown summary of the unfiltered leaderboard.

    Counts are sums over the per-model ``Datasets`` / ``Domains`` /
    ``Categories`` columns; MAE and R² statistics are computed from the
    per-model values in the table. Returns ``"No data available."`` when the
    table is empty.
    """
    overall_df = create_overall_table()
    if overall_df.empty:
        return "No data available."

    total_models = len(overall_df)
    total_datasets, total_domains, total_categories = (
        overall_df[column].sum() for column in ('Datasets', 'Domains', 'Categories')
    )

    # The table stores metrics as formatted strings; parse them back.
    mae_values = list(map(float, overall_df['MAE']))
    r2_values = list(map(float, overall_df['R²']))

    avg_mae, best_mae = np.mean(mae_values), min(mae_values)
    avg_r2, best_r2 = np.mean(r2_values), max(r2_values)

    return f"""
**Overall Summary:**
- Total Models: {total_models}
- Total Dataset Evaluations: {total_datasets}
- Total Domain Evaluations: {total_domains}
- Total Category Evaluations: {total_categories}

**Performance Metrics:**
- Average MAE: {avg_mae:.3f}
- Best MAE: {best_mae:.3f}
- Average R²: {avg_r2:.3f}
- Best R²: {best_r2:.3f}
"""
src/populate.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data population functions for MUSED-FM Leaderboard
3
+ """
4
+
5
+ import pandas as pd
6
+ from typing import Dict, List, Any, Optional
7
+ from .load_results import load_results_with_metadata, create_overall_table
8
+
9
def get_leaderboard_df(results_path: str, requests_path: str, eval_cols: List[str], benchmark_cols: List[str]) -> pd.DataFrame:
    """Return the aggregated leaderboard table.

    All four parameters are accepted for interface compatibility but are not
    used; data comes from ``create_overall_table()``.

    Returns:
        The unfiltered overall table, or an empty DataFrame when no results
        are available.
    """
    # create_overall_table() already returns an empty DataFrame when nothing
    # is loaded, so a separate load here would only read every file twice.
    return create_overall_table()
17
+
18
def get_model_info_df(results_path: str, requests_path: str) -> pd.DataFrame:
    """Build a one-row-per-model table of descriptive model information.

    Both parameters are unused. The first result entry seen for each model
    supplies its row; ``model_type`` and ``testdata_leakage`` are filled with
    fixed defaults. Returns an empty DataFrame when there are no results.
    """
    results = load_results_with_metadata()
    if not results:
        return pd.DataFrame()

    # Keep only the first entry encountered for every model name.
    rows_by_model = {}
    for entry in results:
        name = entry["model"]
        if name in rows_by_model:
            continue
        rows_by_model[name] = {
            "model": name,
            "organization": entry["submitter"],
            "submission_date": entry["submission_date"],
            "task": entry.get("task", ""),
            "dataset_version": entry.get("dataset_version", ""),
            "paper_url": entry.get("paper_url", ""),
            "code_url": entry.get("code_url", ""),
            "model_type": "Foundation Model",  # Default
            "testdata_leakage": "No"  # Default
        }

    rows = list(rows_by_model.values())
    return pd.DataFrame(rows)
42
+
43
def get_merged_df(leaderboard_df: pd.DataFrame, model_info_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join model information onto the leaderboard and rank by MAE.

    Args:
        leaderboard_df: Leaderboard rows; the model name may be in a column
            named ``"model"`` or ``"Model"``.
        model_info_df: Model information keyed by a ``"model"`` column.

    Returns:
        ``leaderboard_df`` unchanged when either input is empty. Otherwise the
        merged frame; when it has an ``MAE`` column, a ``Rank`` column
        (1 = lowest MAE, ties share the lowest rank) is placed first.

    Raises:
        KeyError: If ``leaderboard_df`` has no model-name column.
    """
    if leaderboard_df.empty or model_info_df.empty:
        return leaderboard_df

    # The aggregated leaderboard names its key column "Model", while the model
    # info frame uses "model"; support both spellings on the left side.
    if "model" in leaderboard_df.columns:
        merged = pd.merge(leaderboard_df, model_info_df, on="model", how="left")
    elif "Model" in leaderboard_df.columns:
        merged = pd.merge(
            leaderboard_df,
            model_info_df,
            left_on="Model",
            right_on="model",
            how="left",
        )
        # Drop the duplicate key column contributed by the right-hand frame.
        merged = merged.drop(columns=["model"])
    else:
        raise KeyError("leaderboard_df has no 'model' or 'Model' column")

    # Add rank column
    if 'MAE' in merged.columns:
        # MAE may hold formatted strings; rank numerically, not lexicographically.
        # Unparseable values become NaN and are ranked last so astype(int) is safe.
        mae_numeric = pd.to_numeric(merged['MAE'], errors='coerce')
        merged['Rank'] = mae_numeric.rank(method='min', na_option='bottom').astype(int)
        # Move Rank to front
        cols = ['Rank'] + [col for col in merged.columns if col != 'Rank']
        merged = merged[cols]

    return merged
59
+
60
def get_evaluation_queue_df(requests_path: str, eval_cols: List[str]) -> tuple:
    """Return the three evaluation-queue tables.

    Placeholder: both arguments are ignored and three empty DataFrames are
    returned as a tuple.
    """
    # Return empty dataframes for now
    return tuple(pd.DataFrame() for _ in range(3))
src/utils.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for MUSED-FM Leaderboard
3
+ """
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from typing import Dict, List, Any, Optional
8
+
9
def norm_sNavie(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` as-is.

    Placeholder for a normalization step: no normalization is applied and the
    very same object is handed back.
    """
    # Simple normalization - keep as is for now
    return df
13
+
14
def pivot_df(file_path: str, tab_name: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` into a DataFrame.

    No pivoting is performed yet and ``tab_name`` is unused. Any error while
    reading is printed and an empty DataFrame is returned instead.
    """
    try:
        # Simple pivot - return as is for now
        return pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
    return pd.DataFrame()
23
+
24
def get_grouped_dfs() -> Dict[str, pd.DataFrame]:
    """Return the leaderboard table for each named view.

    The views are ``'domain'``, ``'frequency'``, ``'term_length'``,
    ``'univariate'`` and ``'overall'``. For now every view receives its own
    copy of the same overall table; in a real implementation these would be
    different aggregations. All frames are empty when no results exist.
    """
    from .load_results import create_overall_table

    # create_overall_table() returns an empty DataFrame when no results are
    # available, so no separate load or emptiness check is needed here.
    overall_df = create_overall_table()

    view_names = ('domain', 'frequency', 'term_length', 'univariate', 'overall')
    return {view: overall_df.copy() for view in view_names}
51
+
52
def pivot_existed_df(df: pd.DataFrame, tab_name: str) -> pd.DataFrame:
    """Tag a table with the tab it belongs to.

    Returns ``df`` itself when it is empty; otherwise a new frame with a
    ``'tab'`` column set to ``tab_name`` on every row. The input is not
    modified.
    """
    if df.empty:
        return df

    # Add tab name as a column for identification
    return df.assign(tab=tab_name)
61
+
62
def rename_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``MASE_Rank`` column ranking rows by their ``MAE`` value.

    Despite the name, no column is renamed. Returns ``df`` itself when it is
    empty or has no ``MAE`` column; otherwise a copy with ``MASE_Rank`` added
    (ties share the lowest rank). The input is not modified.
    """
    if df.empty or 'MAE' not in df.columns:
        return df

    # Add rank column based on MAE
    ranked = df.copy()
    ranked['MASE_Rank'] = ranked['MAE'].rank(method='min')
    return ranked
74
+
75
def _format_metric(value: Any, pattern: str) -> Any:
    """Format one metric cell with ``pattern``; keep strings, blank out missing values."""
    if isinstance(value, str):
        # Already formatted for display; a float format spec would raise
        # ValueError on a str.
        return value
    if pd.isna(value):
        return ""
    return pattern.format(value)


def format_df(df: pd.DataFrame) -> pd.DataFrame:
    """Format the metric columns of ``df`` for display.

    ``MAPE`` and ``SMAPE`` are rendered with one decimal and a ``%`` suffix;
    the other metric columns with three decimals. Missing values become empty
    strings, and cells that already hold strings are left untouched so tables
    whose metrics were formatted earlier can be passed through safely.

    Returns:
        ``df`` itself when it is empty; otherwise a formatted copy (the input
        is not modified).
    """
    if df.empty:
        return df

    df_copy = df.copy()

    # Format numeric columns
    percent_cols = ('MAPE', 'SMAPE')
    numeric_cols = ['MAE', 'Uni-MAE', 'RMSE', 'MAPE', 'R²', 'SMAPE', 'Uni-Multi']
    for col in numeric_cols:
        if col not in df_copy.columns:
            continue
        pattern = "{:.1f}%" if col in percent_cols else "{:.3f}"
        # Bind the pattern as a default so each column keeps its own format.
        df_copy[col] = df_copy[col].apply(lambda x, p=pattern: _format_metric(x, p))

    return df_copy