Spaces:

smolagents
/

ml-agent

Running

App Files Files Community

akseljoonas HF Staff committed on Nov 14, 2025

Commit

f92b0c4

1 Parent(s): 6b80d78

dataset creation script

Browse files

Files changed (1) hide show

eval/create_eval_dataset.py +160 -0

eval/create_eval_dataset.py ADDED Viewed

	@@ -0,0 +1,160 @@

+from itertools import product
+from datasets import Dataset
+# Task templates (excluding Very hard difficulty)
+tasks = [
+    {
+        "task": "Evaluate models {M} on benchmarks {B}",
+        "difficulty": "Easy",
+        "category": "Evaluation",
+        "params": ["M", "B"],
+    },
+    {
+        "task": "Train models {M} on datasets {D} evaluating them on benchmarks {B}",
+        "difficulty": "Medium",
+        "category": "Training",
+        "params": ["M", "D", "B"],
+    },
+    {
+        "task": "Run an ablation for hyperparameter {P} for model {M} on dataset {D}",
+        "difficulty": "Hard",
+        "category": "Ablation",
+        "params": ["P", "M", "D"],
+    },
+    {
+        "task": "Generate completions with model {M} on benchmarks {B} using engine {E}",
+        "difficulty": "Medium",
+        "category": "Generation",
+        "params": ["M", "B", "E"],
+    },
+    # {
+    #     "task": "Merge models {M} using linear averaging to find the best result on benchmarks {B}",
+    #     "difficulty": "Hard",
+    #     "category": "Model Merging",
+    #     "params": ["M", "B"],
+    # },
+    {
+        "task": "Decontaminate dataset {D} against benchmarks {B}",
+        "difficulty": "Hard",
+        "category": "Data Processing",
+        "params": ["D", "B"],
+    },
+    {
+        "task": "Format dataset {D} for compatibility with framework {F} on task {T}",
+        "difficulty": "Easy",
+        "category": "Data Formatting",
+        "params": ["D", "F", "T"],
+    },
+]
+# Parameter values
+values = {
+    "M": [
+        "Qwen/Qwen3-4B-Instruct-2507",
+        "openai/gpt-oss-20b",
+        "gpt-4o-mini",
+        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+        "anthropic's latest model",
+    ],
+    "B": [
+        "Idavidrein/gpqa",
+        "HuggingFaceH4/MATH-500",
+        "lighteval/SimpleQA",
+        "TIGER-Lab/MMLU-Pro",
+    ],
+    "D": [
+        "HuggingFaceH4/multi_turn_if",
+        "HuggingFaceH4/ultrachat_200k",
+        "HuggingFaceH4/AceReason-1.1-SFT config: math_no_think",
+    ],
+    "E": [
+        "vllm",
+        "sglang",
+    ],
+    "F": [
+        "trl",
+        "axolotl",
+        "verl",
+    ],
+    "P": [
+        "learning_rate",
+        "batch_size",
+        "num_epochs",
+    ],
+    "T": [
+        "SFT",
+        "GRPO",
+    ],
+}
+# Task-specific instance limits
+# For each task, specify which parameter(s) to pivot on and how many instances per pivot combination
+# pivot can be a single parameter string or a list of parameters
+task_limits = [
+    {"pivot": "B", "instances_per_pivot": 1},  # Task 0: 1 instance per
+    {"pivot": ["M", "B"], "instances_per_pivot": 3},  # Task 1: 3 instances per model
+    {"pivot": ["P", "D"], "instances_per_pivot": 3},  # Task 2:
+    {"pivot": "E", "instances_per_pivot": 2},  # Task 3: 2 instances per benchmark
+    # {"pivot": "M", "instances_per_pivot": 2},  # Task 4
+    {"pivot": "D", "instances_per_pivot": 2},  # Task 5: 2 instances per dataset
+    {"pivot": ["D", "F", "T"], "instances_per_pivot": 2},  # Task 6:
+]
+def main():
+    eval_data = []
+    for task_idx, task_dict in enumerate(tasks):
+        template = task_dict["task"]
+        params = task_dict["params"]
+        limit_config = task_limits[task_idx]
+        pivot_params = limit_config["pivot"]
+        instances_per_pivot = limit_config["instances_per_pivot"]
+        # Normalize pivot to list
+        if isinstance(pivot_params, str):
+            pivot_params = [pivot_params]
+        # Get all combinations of pivot values
+        pivot_param_values = [values[p] for p in pivot_params]
+        pivot_combinations = product(*pivot_param_values)
+        # For each pivot combination, generate limited instances
+        for pivot_combo in pivot_combinations:
+            # Get combinations of other (non-pivot) parameters
+            other_params = [p for p in params if p not in pivot_params]
+            other_param_values = [values[p] for p in other_params]
+            other_combinations = list(product(*other_param_values))
+            # Limit to specified number of instances per pivot combination
+            limited_combinations = other_combinations[:instances_per_pivot]
+            # Generate instances
+            for combo in limited_combinations:
+                # Build kwargs with pivot values and other values
+                kwargs = dict(zip(pivot_params, pivot_combo))
+                kwargs.update(dict(zip(other_params, combo)))
+                concrete_task = template.format(**kwargs)
+                eval_data.append(
+                    {
+                        "task": concrete_task,
+                        "difficulty": task_dict["difficulty"],
+                        "category": task_dict["category"],
+                    }
+                )
+    print(f"Generated {len(eval_data)} instances from {len(tasks)} templates")
+    dataset = Dataset.from_list(eval_data)
+    print(f"\nDataset: {len(dataset)} rows")
+    print(f"Sample: {dataset[0]['task']}")
+    dataset.push_to_hub("akseljoonas/qyestions", private=False)
+    print("\n✓ Pushed to akseljoonas/qyestions")
+if __name__ == "__main__":
+    main()