akseljoonas HF Staff committed on
Commit
8541221
·
1 Parent(s): 158d846

generated, filled in and verified 250 eval questions

eval/amp_batch_solve.py DELETED
@@ -1,106 +0,0 @@
-import asyncio
-import json
-import os
-from pathlib import Path
-import threading
-
-from amp_sdk import AmpOptions, execute
-
-# Thread-safe file writing
-file_lock = threading.Lock()
-
-
-async def solve_task(
-    question: str, difficulty: str, task_idx: int, total: int, semaphore: asyncio.Semaphore
-) -> dict:
-    """Solve a single task using Amp SDK."""
-    async with semaphore:
-        print(f"[{task_idx}/{total}] Starting: {question[:60]}...")
-
-        messages = []
-        solution = None
-
-        try:
-            async for message in execute(
-                question,
-                AmpOptions(
-                    cwd=os.getcwd(),
-                    visibility="workspace",
-                    dangerously_allow_all=True,
-                ),
-            ):
-                messages.append(message.model_dump())
-
-                # Extract the final text response as solution
-                if message.type == "assistant":
-                    content = message.message.get("content", [])
-                    for item in content:
-                        if isinstance(item, dict) and item.get("type") == "text":
-                            solution = item.get("text")
-                elif message.type == "result":
-                    if message.result:
-                        solution = message.result
-
-            print(f"[{task_idx}/{total}] ✓ Done: {question[:60]}...")
-            return {
-                "question": question,
-                "difficulty": difficulty,
-                "solution": solution,
-                "messages": messages,
-            }
-        except Exception as e:
-            print(f"[{task_idx}/{total}] ✗ Error: {e}")
-            return {
-                "question": question,
-                "difficulty": difficulty,
-                "solution": None,
-                "messages": messages,
-                "error": str(e),
-            }
-
-
-def write_result(output_path: Path, result: dict):
-    """Thread-safe write to output file."""
-    with file_lock:
-        with open(output_path, "a") as f:
-            f.write(json.dumps(result) + "\n")
-
-
-async def main():
-    # Load tasks
-    tasks_path = Path(__file__).parent / "generated_tasks_with_difficulty.json"
-    with open(tasks_path) as f:
-        tasks = json.load(f)
-
-    # Output file - clear it first
-    output_path = Path(__file__).parent / "solved_tasks.jsonl"
-    output_path.write_text("")
-
-    # Semaphore to limit concurrency
-    max_concurrent = 20
-    semaphore = asyncio.Semaphore(max_concurrent)
-
-    total = len(tasks)
-    print(f"Processing {total} tasks with {max_concurrent} concurrent agents...")
-
-    async def process_and_save(question: str, difficulty: str, idx: int):
-        result = await solve_task(question, difficulty, idx, total, semaphore)
-        write_result(output_path, result)
-        return result
-
-    # Create all tasks
-    coroutines = [
-        process_and_save(question, difficulty, i + 1)
-        for i, (question, difficulty) in enumerate(tasks.items())
-    ]
-
-    # Run all concurrently (semaphore limits actual parallelism)
-    results = await asyncio.gather(*coroutines, return_exceptions=True)
-
-    successful = sum(1 for r in results if isinstance(r, dict) and "error" not in r)
-    print(f"\nCompleted: {successful}/{total} successful")
-    print(f"Results saved to {output_path}")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
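For reference, the deleted batch solver unpacked tasks.items() into (question, difficulty) pairs, so generated_tasks_with_difficulty.json is expected to be a single JSON object mapping each question to a difficulty label. A minimal, hypothetical sketch of that shape (the questions below are invented for illustration):

# Hypothetical shape of generated_tasks_with_difficulty.json, as implied by the
# tasks.items() unpacking in the deleted main() above: {question: difficulty}.
example_tasks = {
    "How do I push a fine-tuned model to the Hugging Face Hub?": "easy",
    "Benchmark two sentence-transformers models on a retrieval dataset.": "hard",
}
for i, (question, difficulty) in enumerate(example_tasks.items()):
    print(i + 1, difficulty, question)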
eval/amp_solve.py DELETED
@@ -1,31 +0,0 @@
-import asyncio
-import os
-
-from amp_sdk import AmpOptions, execute
-
-prompt = """
-what account am I logged in as?
-"""
-
-
-async def main():
-    # Use the toolbox directory to share tools with Amp
-    toolbox_dir = os.path.join(os.getcwd(), "toolbox")
-    messages = []
-    async for message in execute(
-        prompt,
-        AmpOptions(
-            cwd=os.getcwd(),
-            toolbox=toolbox_dir,
-            visibility="workspace",
-            dangerously_allow_all=True,
-        ),
-    ):
-        messages.append(message)
-
-    for msg in messages:
-        print(msg.model_dump_json(indent=2))
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
eval/check_completeness.py ADDED
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+"""
+Minimal script to check if tasks in solved_tasks.jsonl were fully completed and verified.
+Uses an LLM to assess completion status and adds the result to each row.
+"""
+
+import argparse
+import json
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import litellm
+from dotenv import load_dotenv
+from pydantic import BaseModel
+
+load_dotenv()
+
+
+class CompletionCheck(BaseModel):
+    reasoning: str
+    completed: bool
+    verified: bool
+
+
+PROMPT = """You are evaluating whether an AI agent fully completed a task AND verified its completion.
+
+Task: {question}
+
+Agent's final answer: {solution}
+
+Agent's trace (tool calls and responses):
+{trace}
+
+Evaluate:
+1. **completed**: Did the agent actually complete the task? (not just explain what could be done, but actually do it)
+2. **verified**: Did the agent verify/confirm that the task was completed correctly? (e.g., checked output, validated results, confirmed success)
+
+Be strict:
+- If the agent asked for more information or said "please provide...", it's NOT completed.
+- If the agent only explained how to do something but didn't do it, it's NOT completed.
+- If the agent just made a plan of how to complete it but didn't do it, it's NOT completed.
+- If there's an error in the trace and no recovery, it's NOT completed.
+- If the agent didn't check/confirm that the code/command completed successfully or that the result is correct, it's NOT verified.
+
+Return JSON with: completed (bool), verified (bool), reasoning (brief explanation)."""
+
+
+def format_trace(messages: list) -> str:
+    """Format messages trace for the prompt."""
+    if not messages:
+        return "(No trace)"
+
+    parts = []
+    for msg in messages:
+        role = msg.get("role", "unknown")
+        if role == "system":
+            continue
+
+        content = msg.get("content", "")
+        tool_calls = msg.get("tool_calls", [])
+
+        if tool_calls:
+            for tc in tool_calls:
+                if isinstance(tc, dict) and "function" in tc:
+                    name = tc["function"].get("name", "?")
+                    parts.append(f"[TOOL CALL] {name}")
+
+        if content:
+            # Truncate long content
+            if len(content) > 5000:
+                content = content[:4000] + "..." + content[-1000:]
+            parts.append(f"[{role.upper()}] {content}")
+
+    return "\n".join(parts) if parts else "(Empty trace)"
+
+
+def check_row(row: dict, model: str) -> CompletionCheck | None:
+    """Check if a single task was completed and verified."""
+    prompt = PROMPT.format(
+        question=row["question"],
+        solution=row.get("solution", "(No solution)"),
+        trace=format_trace(row.get("messages", [])),
+    )
+
+    try:
+        response = litellm.completion(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            response_format=CompletionCheck,
+            timeout=60,
+        )
+        return CompletionCheck.model_validate_json(response.choices[0].message.content)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return None
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Check task completion status")
+    parser.add_argument("--infile", type=str, default="eval/solved_tasks.jsonl")
+    parser.add_argument(
+        "--outfile", type=str, default="eval/solved_tasks_checked.jsonl"
+    )
+    parser.add_argument(
+        "--model", type=str, default="anthropic/claude-sonnet-4-5-20250929"
+    )
+    parser.add_argument("--max-concurrent", type=int, default=30)
+    args = parser.parse_args()
+
+    # Load data
+    print(f"Loading {args.infile}...")
+    rows = []
+    with open(args.infile) as f:
+        for line in f:
+            rows.append(json.loads(line))
+    print(f"Loaded {len(rows)} rows")
+
+    # Process in parallel
+    print(f"Checking completion with {args.model}...")
+    with ThreadPoolExecutor(max_workers=args.max_concurrent) as executor:
+        futures = {
+            executor.submit(check_row, row, args.model): i for i, row in enumerate(rows)
+        }
+        results = [None] * len(rows)
+
+        for future in as_completed(futures):
+            idx = futures[future]
+            results[idx] = future.result()
+            print(
+                f"Done: {sum(1 for r in results if r is not None)}/{len(rows)}",
+                end="\r",
+            )
+
+    print()
+
+    # Merge results
+    output_rows = []
+    for row, result in zip(rows, results):
+        if result:
+            row["task_completed"] = result.completed
+            row["task_verified"] = result.verified
+            row["completion_reasoning"] = result.reasoning
+        else:
+            row["task_completed"] = None
+            row["task_verified"] = None
+            row["completion_reasoning"] = "Error during check"
+        output_rows.append(row)
+
+    # Write output
+    print(f"Writing to {args.outfile}...")
+    with open(args.outfile, "w") as f:
+        for row in output_rows:
+            f.write(json.dumps(row, default=str) + "\n")
+
+    # Summary
+    completed = sum(1 for r in results if r and r.completed)
+    verified = sum(1 for r in results if r and r.verified)
+    print("\nSummary:")
+    print(f" Completed: {completed}/{len(rows)}")
+    print(f" Verified: {verified}/{len(rows)}")
+
+
+if __name__ == "__main__":
+    main()
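check_row() adds task_completed, task_verified, and completion_reasoning to every row it can score. A small sketch for inspecting the output, assuming the default --outfile path and that the script has already been run:

# Sketch: list tasks that were completed but never verified, using the fields
# written by main() above. Assumes the default output path.
import json

with open("eval/solved_tasks_checked.jsonl") as f:
    rows = [json.loads(line) for line in f]

unverified = [r["question"] for r in rows if r["task_completed"] and not r["task_verified"]]
print(f"{len(unverified)} of {len(rows)} tasks were completed but not verified")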
eval/claude_batch_solve.py ADDED
@@ -0,0 +1,230 @@
+import asyncio
+import json
+import os
+import threading
+from pathlib import Path
+from typing import Any
+
+from claude_agent_sdk import (
+    AssistantMessage,
+    ClaudeAgentOptions,
+    ResultMessage,
+    SystemMessage,
+    TextBlock,
+    ToolResultBlock,
+    ToolUseBlock,
+    UserMessage,
+    query,
+)
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Thread-safe file writing
+file_lock = threading.Lock()
+
+
+def convert_message_to_chat_format(message: Any) -> dict | None:
+    """Convert SDK message to standard chat format with role/content/tool_calls."""
+
+    if isinstance(message, SystemMessage):
+        # Extract tools list from init data for system message
+        if message.subtype == "init":
+            tools = message.data.get("tools", [])
+            tools_desc = "\n".join(f"- {tool}" for tool in tools)
+            return {
+                "role": "system",
+                "content": f"You are a helpful assistant with access to the following tools:\n{tools_desc}",
+            }
+        return None
+
+    elif isinstance(message, AssistantMessage):
+        text_content = ""
+        tool_calls = []
+
+        for block in message.content:
+            if isinstance(block, TextBlock):
+                text_content += block.text
+            elif isinstance(block, ToolUseBlock):
+                tool_calls.append(
+                    {
+                        "id": block.id,
+                        "function": {
+                            "name": block.name,
+                            "arguments": block.input,
+                        },
+                    }
+                )
+
+        result = {"role": "assistant", "content": text_content}
+        if tool_calls:
+            result["tool_calls"] = tool_calls
+        return result
+
+    elif isinstance(message, UserMessage):
+        # UserMessage can contain tool results or text
+        if isinstance(message.content, str):
+            return {"role": "user", "content": message.content}
+        elif isinstance(message.content, list):
+            # Check for tool results
+            tool_results = []
+            text_content = ""
+            for block in message.content:
+                if isinstance(block, ToolResultBlock):
+                    # Format tool result content
+                    if isinstance(block.content, str):
+                        content = block.content
+                    elif isinstance(block.content, list):
+                        content = json.dumps(block.content)
+                    else:
+                        content = str(block.content) if block.content else ""
+
+                    tool_results.append(
+                        {
+                            "tool_use_id": block.tool_use_id,
+                            "content": content,
+                            "is_error": block.is_error,
+                        }
+                    )
+                elif isinstance(block, TextBlock):
+                    text_content += block.text
+
+            if tool_results:
+                return {
+                    "role": "user",
+                    "content": f"<tool_response>\n{json.dumps(tool_results, indent=2)}\n</tool_response>",
+                }
+            else:
+                return {"role": "user", "content": text_content}
+        return None
+
+    elif isinstance(message, ResultMessage):
+        # ResultMessage is metadata, not a conversation message
+        return None
+
+    return None
+
+
+async def solve_task(
+    question: str,
+    difficulty: str,
+    task_idx: int,
+    total: int,
+    semaphore: asyncio.Semaphore,
+) -> dict:
+    """Solve a single task using Claude Agent SDK."""
+    async with semaphore:
+        print(f"[{task_idx}/{total}] Starting: {question[:60]}...")
+
+        messages = []
+        solution = None
+
+        try:
+            async for message in query(
+                prompt=question,
+                options=ClaudeAgentOptions(
+                    cwd=os.getcwd(),
+                    permission_mode="bypassPermissions",
+                    disallowed_tools=["Write", "Edit", "Bash", "Glob", "Grep"],
+                    mcp_servers={
+                        "huggingface": {
+                            "type": "http",
+                            "url": "https://huggingface.co/mcp",
+                            "headers": {
+                                "Authorization": f"Bearer {os.environ['HF_TOKEN']}"
+                            },
+                        }
+                    },
+                ),
+            ):
+                # Convert to chat format and append if valid
+                chat_msg = convert_message_to_chat_format(message)
+                if chat_msg:
+                    messages.append(chat_msg)
+
+                # Extract text from assistant messages
+                if isinstance(message, AssistantMessage):
+                    for block in message.content:
+                        if isinstance(block, TextBlock):
+                            solution = block.text
+                # Check for result messages
+                elif isinstance(message, ResultMessage):
+                    if message.is_error:
+                        print(f"[{task_idx}/{total}] ✗ Agent error: {message.subtype}")
+                        return {
+                            "question": question,
+                            "difficulty": difficulty,
+                            "solution": None,
+                            "messages": messages,
+                            "error": f"Agent error: {message.subtype}",
+                        }
+                    elif message.result:
+                        solution = message.result
+
+            print(f"[{task_idx}/{total}] ✓ Done: {question[:60]}...")
+            return {
+                "question": question,
+                "difficulty": difficulty,
+                "solution": solution,
+                "messages": messages,
+                "error": None,
+            }
+        except Exception as e:
+            print(f"[{task_idx}/{total}] ✗ Error: {e}")
+            return {
+                "question": question,
+                "difficulty": difficulty,
+                "solution": None,
+                "messages": messages,
+                "error": str(e),
+            }
+
+
+def write_result(output_path: Path, result: dict):
+    """Thread-safe write to output file."""
+    with file_lock:
+        with open(output_path, "a") as f:
+            f.write(json.dumps(result) + "\n")
+
+
+async def main():
+    # Load tasks from filled_tasks.jsonl
+    tasks_path = Path(__file__).parent / "filled_tasks.jsonl"
+    tasks = []
+    with open(tasks_path) as f:
+        for line in f:
+            tasks.append(json.loads(line))
+
+    # Output file - clear it first
+    output_path = Path(__file__).parent / "solved_tasks.jsonl"
+    output_path.write_text("")
+
+    # Semaphore to limit concurrency
+    max_concurrent = 5
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    total = len(tasks)
+    print(f"Processing {total} tasks with {max_concurrent} concurrent agents...")
+
+    async def process_and_save(task: dict, idx: int):
+        result = await solve_task(
+            task["question"], task["difficulty"], idx, total, semaphore
+        )
+        write_result(output_path, result)
+        return result
+
+    # Create all tasks
+    coroutines = [process_and_save(task, i + 1) for i, task in enumerate(tasks)]
+
+    # Run all concurrently (semaphore limits actual parallelism)
+    results = await asyncio.gather(*coroutines, return_exceptions=True)
+
+    successful = sum(
+        1 for r in results if isinstance(r, dict) and r.get("error") is None
+    )
+    print(f"\nCompleted: {successful}/{total} successful")
+    print(f"Results saved to {output_path}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
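The solver reads eval/filled_tasks.jsonl (one JSON object per line with at least question and difficulty keys) and appends one result object per task to eval/solved_tasks.jsonl. A sketch of a single input line; the question text is invented, but the field names match what main() reads above:

# One line of filled_tasks.jsonl (hypothetical content, real field names).
import json

task = {
    "question": "Find the most downloaded text-classification model on the Hub and report its license.",
    "difficulty": "medium",
}
print(json.dumps(task))  # each line of the .jsonl file is one such object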
eval/eval_set.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
eval/generate_rubrics.py CHANGED
@@ -38,19 +38,22 @@ load_dotenv()
 # Rubric generation prompt template based on RaR paper
 
 
-PROMPT_TEMPLATE = """You are an expert rubric writer. Your job is to generate a self-contained set of evaluation criteria (rubrics) for judging
-how good, helpful and complete an agent's trajectory is to a given user question/request.
+PROMPT_TEMPLATE = """You are an expert rubric writer. Your job is to generate a self-contained set of evaluation criteria ("rubrics") for judging how good, helpful and complete an agent's trajectory is to a given user question/request.
 
 Rubrics can cover aspects of a response such as, but not limited to, factual correctness, helpfulness, completeness, harmlessness, correctness of using Hugging Face best practices (based on HF documentation), depth of
 reasoning, contextual relevance and usefulness. Each item must be self-contained – non expert readers should not need to
-infer anything or consult external information. Begin each description with its category: Essential Criteria: . . . ”, Important
-Criteria: . . . ”, Optional Criteria: . . . ”, or Pitfall Criteria: Does not mention . . . ”.
+infer anything or consult external information. Begin each description with its category: "Essential Criteria: . . . ", "Important
+Criteria: . . . ", "Optional Criteria: . . . ", or "Pitfall Criteria: Does not mention . . . ".
 
 
-Inputs: !!!
+Inputs:
 - question: <<<{question}>>>
-- reference_answer (ideal solution): <<<{reference_answer}>>>
-- thread: <<<{thread}>>>
+- example_solution (NOT ground truth - just an okay attempt): <<<{example_solution}>>>
+- example_trace (NOT ground truth - just an okay attempt showing what tool usage might look like): <<<{example_trace}>>>
+
+IMPORTANT: The example_solution and example_trace provided are NOT ground truth or ideal solutions. They represent
+an attempt at solving the task - they give you a general idea of the shape of the problem and what tool usage
+might look like, but they may contain mistakes, suboptimal approaches, or incomplete answers. Your rubrics MUST be designed to fairly grade a PERFECT solution. The perfect solution is complete in all aspects of solving the task and verifying its correctness before giving the final answer. It tells the user what was done and why, and provides the final answer clearly answering the user's question.
 
 Total items:
 • Choose 7–20 rubric items based on the complexity of the question.
@@ -69,47 +72,97 @@ Category guidance:
 • Important: Key reasoning, completeness, or clarity; strongly affects quality and usefulness (weight 3–4).
 • Optional: Helpfulness in educating the user or providing extra depth; nice to have but not deal-breaking (weight 1–2).
 • Pitfall: Common mistakes or omissions specific to this prompt—identify things a respondent often forgets or misstates.
-Each Pitfall description must begin with Pitfall Criteria: Does not mention . . . or Pitfall Criteria: Recommends . . .
+Each Pitfall description must begin with "Pitfall Criteria: Does not mention . . . " or "Pitfall Criteria: Recommends . . . "
 and use weight –1 or –2.
 
 To ensure self-contained guidance:
-• When referring to answer choices, explicitly say Identifies (A)”, Identifies (B)”, etc., rather than vague phrasing.
+• When referring to answer choices, explicitly say "Identifies (A)", "Identifies (B)", etc., rather than vague phrasing.
 • If the format requires an action like calling a tool or launching a training run, include a rubric item such as:
 – Essential Criteria: Includes a clear statement "Launches the training with hf-jobs.".
 • If reasoning should precede the answer, include a rubric like:
 – Important Criteria: Presents the explanation and reasoning before stating the final answer.
 • If brevity is valued, include a rubric like:
 – Optional Criteria: Remains concise and avoids unnecessary detail.
-• If the question context demands mention of specific findings/best practices, include that explicitly (e.g., Essential Criteria: Mentions
-that training data must be in "messages" column for LLM training).
+• If the question context demands mention of specific findings/best practices, include that explicitly (e.g., "Essential Criteria: Mentions
+that training data must be in "messages" column for LLM training").
 
 Output: Provide a JSON array of rubric objects. Each object must contain exactly three keys—title, description, and weight.
-Do not copy large blocks of the question or reference_answer into the text. Each description must begin with its category
+Do not copy large blocks of the question or example_solution into the text. Each description must begin with its category
 prefix, and no extra keys are allowed.
-Now, given the question, thread and reference_answer, generate the rubric as described. The reference answer is an good and helpful response
-but not necessarily exhaustive; use it only as guidance."""
+
+Remember: The example_solution and example_trace are NOT ideal answers - they are just rough attempts to show the
+general approach. Design rubrics that can fairly evaluate any solution, including ones that are better than the example."""
 
 
 def build_prompt(
-    question: str, reference_answer: str, thread: List[Dict[str, str]]
+    question: str,
+    example_solution: str,
+    example_trace: List[Dict[str, Any]],
 ) -> List[Dict[str, str]]:
     """
     Build the messages list for LiteLLM completion.
 
     Args:
         question: The question/task to evaluate
-        reference_answer: The reference/accepted solution
+        difficulty: The difficulty level of the task
+        example_solution: An example solution attempt (not ground truth)
+        example_trace: The agent's message trace showing tool usage
 
     Returns:
         List of message dicts for LiteLLM
     """
+    # Format the trace for readability - only include key parts
+    formatted_trace = format_trace_for_prompt(example_trace)
+
     prompt = PROMPT_TEMPLATE.format(
-        question=question, reference_answer=reference_answer, thread=thread
+        question=question,
+        example_solution=example_solution,
+        example_trace=formatted_trace,
     )
 
     return [{"role": "user", "content": prompt}]
 
 
+def format_trace_for_prompt(messages: List[Dict[str, Any]]) -> str:
+    """
+    Format the agent message trace for inclusion in the prompt.
+    Extracts key information while keeping it readable.
+    """
+    if not messages:
+        return "(No trace available)"
+
+    formatted_parts = []
+    for msg in messages:
+        role = msg.get("role", "unknown")
+        content = msg.get("content", "")
+
+        # Skip system messages
+        if role == "system":
+            continue
+
+        # Handle tool calls
+        if "tool_calls" in msg and msg["tool_calls"]:
+            tool_info = []
+            for tc in msg["tool_calls"]:
+                if isinstance(tc, dict) and "function" in tc:
+                    func = tc["function"]
+                    tool_name = func.get("name", "unknown_tool")
+                    tool_info.append(f" - Called: {tool_name}")
+            if tool_info:
+                formatted_parts.append(
+                    "[Assistant Tool Calls]\n" + "\n".join(tool_info)
+                )
+
+        # Handle regular content
+        if content:
+            # Truncate very long content
+            if len(content) > 500:
+                content = content[:500] + "... (truncated)"
+            formatted_parts.append(f"[{role.title()}]\n{content}")
+
+    return "\n\n".join(formatted_parts) if formatted_parts else "(Empty trace)"
+
+
+
 def validate_rubric(rubric_list: List[Dict[str, Any]]) -> bool:
     """
     Validate that rubric meets basic requirements.
@@ -151,8 +204,7 @@ def generate_rubric(row: pd.Series, model: str, timeout: int = 120) -> Dict[str,
     Generate rubric for a single question using LiteLLM.
 
     Args:
-        question: The question text
-        reference_answer: The reference solution
+        row: DataFrame row containing question, difficulty, solution, and messages
        model: Model name for LiteLLM
        timeout: Request timeout in seconds
 
@@ -160,7 +212,11 @@ def generate_rubric(row: pd.Series, model: str, timeout: int = 120) -> Dict[str,
        Dict with rubric_list and rubric_count, or None on failure
    """
 
-    messages = build_prompt(row["question"], row["solution"], row["thread"])
+    messages = build_prompt(
+        question=row["question"],
+        example_solution=row["solution"],
+        example_trace=row.get("messages", []),
+    )
 
    try:
        response = litellm.completion(
@@ -206,17 +262,19 @@ def load_input_data(infile: str) -> pd.DataFrame:
 
    # Validate required columns
    required_cols = [
-        "discussion_title",
-        "discussion_url",
        "question",
-        "thread",
        "solution",
    ]
+    optional_cols = ["difficulty", "messages", "error"]
    missing_cols = [col for col in required_cols if col not in df.columns]
 
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
 
+    # Log available optional columns
+    available_optional = [col for col in optional_cols if col in df.columns]
+    print(f"Found optional columns: {available_optional}")
+
    return df
 
 
@@ -307,6 +365,7 @@ def main():
 
            # Merge with original data
            output_row = row.to_dict()
+            output_row["messages"] = json.dumps(output_row["messages"])
            output_row["rubric"] = rubric_result
            output_rows.append(output_row)
            success_count += 1
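The updated prompt asks the model for a JSON array in which every item carries exactly title, description, and weight, and every description starts with its category prefix. A hand-written sketch of what a valid rubric list might look like (the wording is invented; only the structure follows the prompt and the validate_rubric() contract):

# Sketch of the rubric structure requested by PROMPT_TEMPLATE: a list of objects
# with exactly three keys each. The descriptions below are illustrative only.
example_rubric = [
    {
        "title": "States the final answer clearly",
        "description": "Essential Criteria: Clearly states the final answer to the user's question after the supporting reasoning.",
        "weight": 5,
    },
    {
        "title": "Skips verification",
        "description": "Pitfall Criteria: Does not mention verifying the command or code output before reporting success.",
        "weight": -1,
    },
]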