akseljoonas HF Staff committed on
Commit
f92b0c4
·
1 Parent(s): 6b80d78

dataset creation script

Browse files
Files changed (1) hide show
  1. eval/create_eval_dataset.py +160 -0
eval/create_eval_dataset.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import product
2
+
3
+ from datasets import Dataset
4
+
5
+ # Task templates (excluding Very hard difficulty)
6
+ tasks = [
7
+ {
8
+ "task": "Evaluate models {M} on benchmarks {B}",
9
+ "difficulty": "Easy",
10
+ "category": "Evaluation",
11
+ "params": ["M", "B"],
12
+ },
13
+ {
14
+ "task": "Train models {M} on datasets {D} evaluating them on benchmarks {B}",
15
+ "difficulty": "Medium",
16
+ "category": "Training",
17
+ "params": ["M", "D", "B"],
18
+ },
19
+ {
20
+ "task": "Run an ablation for hyperparameter {P} for model {M} on dataset {D}",
21
+ "difficulty": "Hard",
22
+ "category": "Ablation",
23
+ "params": ["P", "M", "D"],
24
+ },
25
+ {
26
+ "task": "Generate completions with model {M} on benchmarks {B} using engine {E}",
27
+ "difficulty": "Medium",
28
+ "category": "Generation",
29
+ "params": ["M", "B", "E"],
30
+ },
31
+ # {
32
+ # "task": "Merge models {M} using linear averaging to find the best result on benchmarks {B}",
33
+ # "difficulty": "Hard",
34
+ # "category": "Model Merging",
35
+ # "params": ["M", "B"],
36
+ # },
37
+ {
38
+ "task": "Decontaminate dataset {D} against benchmarks {B}",
39
+ "difficulty": "Hard",
40
+ "category": "Data Processing",
41
+ "params": ["D", "B"],
42
+ },
43
+ {
44
+ "task": "Format dataset {D} for compatibility with framework {F} on task {T}",
45
+ "difficulty": "Easy",
46
+ "category": "Data Formatting",
47
+ "params": ["D", "F", "T"],
48
+ },
49
+ ]
50
+
51
+ # Parameter values
52
+ values = {
53
+ "M": [
54
+ "Qwen/Qwen3-4B-Instruct-2507",
55
+ "openai/gpt-oss-20b",
56
+ "gpt-4o-mini",
57
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
58
+ "anthropic's latest model",
59
+ ],
60
+ "B": [
61
+ "Idavidrein/gpqa",
62
+ "HuggingFaceH4/MATH-500",
63
+ "lighteval/SimpleQA",
64
+ "TIGER-Lab/MMLU-Pro",
65
+ ],
66
+ "D": [
67
+ "HuggingFaceH4/multi_turn_if",
68
+ "HuggingFaceH4/ultrachat_200k",
69
+ "HuggingFaceH4/AceReason-1.1-SFT config: math_no_think",
70
+ ],
71
+ "E": [
72
+ "vllm",
73
+ "sglang",
74
+ ],
75
+ "F": [
76
+ "trl",
77
+ "axolotl",
78
+ "verl",
79
+ ],
80
+ "P": [
81
+ "learning_rate",
82
+ "batch_size",
83
+ "num_epochs",
84
+ ],
85
+ "T": [
86
+ "SFT",
87
+ "GRPO",
88
+ ],
89
+ }
90
+
91
+ # Task-specific instance limits
92
+ # For each task, specify which parameter(s) to pivot on and how many instances per pivot combination
93
+ # pivot can be a single parameter string or a list of parameters
94
+ task_limits = [
95
+ {"pivot": "B", "instances_per_pivot": 1}, # Task 0: 1 instance per
96
+ {"pivot": ["M", "B"], "instances_per_pivot": 3}, # Task 1: 3 instances per model
97
+ {"pivot": ["P", "D"], "instances_per_pivot": 3}, # Task 2:
98
+ {"pivot": "E", "instances_per_pivot": 2}, # Task 3: 2 instances per benchmark
99
+ # {"pivot": "M", "instances_per_pivot": 2}, # Task 4
100
+ {"pivot": "D", "instances_per_pivot": 2}, # Task 5: 2 instances per dataset
101
+ {"pivot": ["D", "F", "T"], "instances_per_pivot": 2}, # Task 6:
102
+ ]
103
+
104
+
105
def main():
    """Expand the task templates into concrete instances and push to the Hub.

    Builds the full instance list, reports counts, converts it to a
    ``datasets.Dataset`` and pushes it to the ``akseljoonas/qyestions`` repo.

    Side effects: prints progress to stdout and performs a network upload.
    """
    eval_data = _build_eval_data()
    print(f"Generated {len(eval_data)} instances from {len(tasks)} templates")

    dataset = Dataset.from_list(eval_data)
    print(f"\nDataset: {len(dataset)} rows")
    print(f"Sample: {dataset[0]['task']}")

    dataset.push_to_hub("akseljoonas/qyestions", private=False)
    print("\n✓ Pushed to akseljoonas/qyestions")


def _build_eval_data():
    """Generate all concrete task instances from the module-level config.

    For each template, every combination of the pivot parameter values is
    enumerated; the remaining (non-pivot) parameters contribute at most
    ``instances_per_pivot`` combinations per pivot combination, taken in
    ``itertools.product`` order.

    Returns:
        list[dict]: dicts with ``task``, ``difficulty`` and ``category`` keys.
    """
    eval_data = []

    # tasks and task_limits are parallel lists: entry i of task_limits
    # configures template i.
    for task_dict, limit_config in zip(tasks, task_limits):
        template = task_dict["task"]
        params = task_dict["params"]

        pivot_params = limit_config["pivot"]
        instances_per_pivot = limit_config["instances_per_pivot"]

        # Normalize a single pivot name to a list.
        if isinstance(pivot_params, str):
            pivot_params = [pivot_params]

        # The non-pivot parameters and their value combinations do not depend
        # on the pivot combination, so compute them once per template
        # (hoisted out of the pivot loop, which the original recomputed every
        # iteration).
        other_params = [p for p in params if p not in pivot_params]
        other_combinations = list(product(*(values[p] for p in other_params)))
        # Cap instances per pivot combination; slicing keeps the first N
        # combinations deterministically.
        limited_combinations = other_combinations[:instances_per_pivot]

        # One pass per combination of pivot values.
        for pivot_combo in product(*(values[p] for p in pivot_params)):
            for combo in limited_combinations:
                # Merge pivot and non-pivot assignments into format kwargs.
                kwargs = dict(zip(pivot_params, pivot_combo))
                kwargs.update(zip(other_params, combo))

                eval_data.append(
                    {
                        "task": template.format(**kwargs),
                        "difficulty": task_dict["difficulty"],
                        "category": task_dict["category"],
                    }
                )

    return eval_data
159
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()