akseljoonas (HF Staff) committed
Commit 2b6a536 · 0 Parent(s)

thinking if we want eval or not

.gitignore ADDED
@@ -0,0 +1,13 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+ .env
+ .DS_Store
+ .claude/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
README.md ADDED
File without changes
eval/df.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval/df.ipynb ADDED
@@ -0,0 +1,137 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b7f67653",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1fcf9d61b3664bc99d616101c201aca8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating train split: 0 examples [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
+ "ds = load_dataset(\"json\", data_files=\"qa_pairs.jsonl\", split=\"train\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "55cd7b9c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "cb9452a5789b4b20bd0b01cce111f961",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4a77e027bda5405991bfde524347013c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4d70f759da55470fba30326d99d6ac1f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Processing Files (0 / 0): | | 0.00B / 0.00B "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "617db8df8ac94a2889d8760fc7f0113a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "New Data Upload: | | 0.00B / 0.00B "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/qa_pairs/commit/6947117631cb56686c192533427bb4400382b4fd', commit_message='Upload dataset', commit_description='', oid='6947117631cb56686c192533427bb4400382b4fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/qa_pairs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/qa_pairs'), pr_revision=None, pr_num=None)"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ds.push_to_hub(\"akseljoonas/qa_pairs\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16d4760d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
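
Note: the notebook above loads eval/qa_pairs.jsonl with the datasets library and pushes it to the Hub as akseljoonas/qa_pairs. A minimal sketch of pulling that dataset back down for downstream use, assuming the repo stays accessible under that name (the column names follow QuestionAndSolution in eval/models.py):

# Sketch only: read the pushed QA pairs back from the Hub.
from datasets import load_dataset

qa_ds = load_dataset("akseljoonas/qa_pairs", split="train")
print(qa_ds[0]["question"][:200])  # "question"/"solution" columns come from QuestionAndSolution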
eval/evaluate.py ADDED
@@ -0,0 +1,202 @@
+ import asyncio
+ import json
+
+ import litellm
+ from models import Correctness, JudgementResult
+
+ # from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_judge_results.py#L16-L33
+ GRADER_TEMPLATE = """
+ Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
+
+ [question]: {question}
+
+ [response]: {response}
+
+ Your judgement must be in the format and criteria specified below:
+
+ extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
+
+ [correct_answer]: {correct_answer}
+
+ reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
+
+ correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
+
+
+ confidence: The extracted confidence score between 0|%| and 100|%| from [response]. Put 100 if there is no confidence score available.
+ """.strip()
+
+ CHOICE_STRINGS = ["yes", "no"]
+
+
+ async def evaluate_single_response(
+     question: str,
+     response: str,
+     correct_answer: str,
+     model: str = "gpt-4o-mini",
+     semaphore: asyncio.Semaphore | None = None,
+ ) -> JudgementResult:
+     """
+     Evaluate a single response against the ground truth using an LLM as judge.
+
+     Args:
+         question: The question being answered
+         response: The response to evaluate
+         correct_answer: The ground truth answer
+         model: The LLM model to use for judging
+         semaphore: Optional semaphore for rate limiting
+
+     Returns:
+         The parsed JudgementResult from the judge model
+     """
+     if semaphore:
+         async with semaphore:
+             return await _evaluate_single_response_impl(
+                 question, response, correct_answer, model
+             )
+     else:
+         return await _evaluate_single_response_impl(
+             question, response, correct_answer, model
+         )
+
+
+ async def _evaluate_single_response_impl(
+     question: str, response: str, correct_answer: str, model: str
+ ) -> JudgementResult:
+     """Internal implementation of single response evaluation"""
+
+     prompt = GRADER_TEMPLATE.format(
+         question=question, response=response, correct_answer=correct_answer
+     )
+
+     # Use litellm with structured output
+     completion = await litellm.acompletion(
+         model=model,
+         messages=[
+             {
+                 "role": "system",
+                 "content": "You are an expert judge evaluating answers for accuracy and equivalence.",
+             },
+             {"role": "user", "content": prompt},
+         ],
+         response_format=JudgementResult,
+         temperature=0.0,
+     )
+
+     # Parse structured output
+     result: JudgementResult = JudgementResult.model_validate_json(
+         completion.choices[0].message.content
+     )
+     return result
+
+
+ async def evaluate_dataset(
+     input_file: str,
+     eval_file: str,
+     output_file: str = "evaluation_results.jsonl",
+     model: str = "gpt-4o-mini",
+     max_concurrent: int = 30,
+     limit: int | None = None,
+ ) -> None:
+     """
+     Evaluate all QA pairs in the input file using an LLM as judge.
+
+     Args:
+         input_file: Path to input JSONL file with QA pairs to grade
+         eval_file: Path to JSONL file with the ground-truth QA pairs
+         output_file: Path to output JSONL file for results
+         model: The LLM model to use for judging
+         max_concurrent: Maximum number of concurrent API calls
+         limit: Optional limit on number of examples to evaluate
+     """
+     with open(input_file, "r") as f:
+         to_evaluate = [json.loads(line) for line in f]
+     if limit:
+         to_evaluate = to_evaluate[:limit]
+
+     print(f"Loaded {len(to_evaluate)} QA pairs to evaluate")
+
+     # Load ground truth
+     print(f"Loading ground truth from {eval_file}...")
+     with open(eval_file, "r") as f:
+         ground_truths = [json.loads(line) for line in f]
+
+     print(f"Loaded {len(ground_truths)} ground truths")
+
+     # Create semaphore for rate limiting
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     # Create evaluation tasks
+     tasks = []
+     for qa_pair, ground_truth in zip(to_evaluate, ground_truths):
+         question = ground_truth.get("question", "")
+         correct_answer = ground_truth.get("solution", "")
+         response = qa_pair.get("solution", "")
+
+         task = evaluate_single_response(
+             response=response,
+             question=question,
+             correct_answer=correct_answer,
+             model=model,
+             semaphore=semaphore,
+         )
+         tasks.append(task)
+
+     # Run evaluations in parallel
+     print(f"Running evaluations with {max_concurrent} parallel workers...")
+     results = await asyncio.gather(*tasks)
+
+     # Tally results (combining them with the original data is currently disabled)
+     output_data = []
+     correct_count = 0
+     error_count = 0
+
+     for qa_pair, result in zip(to_evaluate, results):
+         print(result.model_dump_json())
+         # output_entry = {**qa_pair, "evaluation": result}
+         # output_data.append(output_entry)
+
+         if result.correct == Correctness.yes:
+             correct_count += 1
+         else:
+             error_count += 1
+
+     # # Write results
+     # print(f"Writing results to {output_file}...")
+     # with open(output_file, "w") as f:
+     #     for entry in output_data:
+     #         f.write(entry.model_dump_json() + "\n")
+
+     # Print summary
+     total = len(to_evaluate)
+     success_rate = (total - error_count) / total * 100 if total > 0 else 0
+     accuracy = correct_count / total * 100 if total > 0 else 0
+
+     print("\n" + "=" * 50)
+     print("EVALUATION SUMMARY")
+     print("=" * 50)
+     print(f"Total examples: {total}")
+     print(f"Successful evaluations: {total - error_count}")
+     print(f"Errors: {error_count}")
+     print(f"Success rate: {success_rate:.2f}%")
+     print(f"Correct answers: {correct_count}")
+     print(f"Accuracy: {accuracy:.2f}%")
+     print("=" * 50)
+
+
+ async def main():
+     """Main entry point for the evaluation script"""
+     await evaluate_dataset(
+         input_file="qa_pairs.jsonl",
+         eval_file="qa_pairs.jsonl",
+         output_file="evaluation_results.jsonl",
+         model="gpt-4o-mini",
+         max_concurrent=30,
+         limit=100,  # Set to None to evaluate all, or a number to limit
+     )
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
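
Note: evaluate.py drives the LLM-as-judge loop through litellm, so the judge model needs credentials in the environment (for the default gpt-4o-mini that would normally be OPENAI_API_KEY). Also note that main() currently passes qa_pairs.jsonl as both input_file and eval_file, so the ground-truth solutions are graded against themselves. A minimal smoke-test sketch, assuming the key is set and the code is run from the eval/ directory:

# Hypothetical quick run with a small limit before grading a full dataset.
import asyncio

from evaluate import evaluate_dataset

asyncio.run(
    evaluate_dataset(
        input_file="qa_pairs.jsonl",  # responses to grade
        eval_file="qa_pairs.jsonl",   # ground truth (same file here, as in main())
        model="gpt-4o-mini",
        max_concurrent=5,             # keep concurrency low for a sanity check
        limit=5,
    )
)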
eval/models.py ADDED
@@ -0,0 +1,57 @@
+ """Shared data models for the HF agent project"""
+
+ from datetime import datetime
+ from enum import Enum
+
+ from pydantic import BaseModel, Field
+
+
+ class Discussion(BaseModel):
+     """Model for a discussion thread"""
+
+     title: str
+     url: str
+     topic_id: int
+     category: int
+     created_at: datetime
+
+
+ class QuestionAndSolution(BaseModel):
+     """Model for a QA pair from a discussion"""
+
+     discussion_title: str
+     discussion_url: str
+     discussion_topic_id: int
+     discussion_category: int
+     discussion_created_at: datetime
+     thread: list[dict]
+     question: str
+     solution: str
+
+
+ class Correctness(str, Enum):
+     yes = "yes"
+     no = "no"
+
+
+ class JudgementResult(BaseModel):
+     """Structured output for LLM judge evaluation"""
+
+     extracted_final_answer: str = Field(
+         description="The final exact/snippet answer extracted from the response"
+     )
+     reasoning: str = Field(
+         description="Explanation of why the answer is correct or incorrect"
+     )
+     correct: Correctness = Field(description="'yes' if correct, 'no' if incorrect")
+     confidence: int = Field(
+         description="Confidence score between 0 and 100", ge=0, le=100
+     )
+
+
+ class EvaluationResult(BaseModel):
+     """Model for evaluation results including metadata"""
+
+     success: bool
+     judgement: JudgementResult | None = None
+     error: str | None = None
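
Note: a small sketch of how these models fit together. JudgementResult validates the judge's structured reply (as evaluate.py does), and EvaluationResult can wrap it with success/error metadata, which the evaluation script does not yet populate. The JSON payload here is illustrative, not real judge output:

from models import Correctness, EvaluationResult, JudgementResult

raw = '{"extracted_final_answer": "42", "reasoning": "Matches the reference.", "correct": "yes", "confidence": 90}'
judgement = JudgementResult.model_validate_json(raw)
wrapped = EvaluationResult(success=True, judgement=judgement)

assert wrapped.judgement is not None and wrapped.judgement.correct == Correctness.yes
print(wrapped.model_dump_json())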
eval/qa_pairs.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval/scrape_discussions/discussions_scraper.py ADDED
@@ -0,0 +1,98 @@
+ import sys
+ import time
+ from pathlib import Path
+
+ import requests
+ from tenacity import (
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+ )
+
+ # Add parent directory to path to import models
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+ from models import Discussion, QuestionAndSolution
+
+ BASE_URL = "https://discuss.huggingface.co"
+
+
+ # Retry decorator for HTTP GETs: up to 5 attempts with exponential backoff
+ @retry(
+     stop=stop_after_attempt(5),
+     wait=wait_exponential(multiplier=1, min=1, max=60),
+     retry=retry_if_exception_type(requests.HTTPError),
+ )
+ def safe_get(url, **kwargs):
+     resp = requests.get(url, **kwargs)
+     if resp.status_code == 429:  # rate limited
+         # read the Retry-After header if present
+         retry_after = resp.headers.get("Retry-After")
+         if retry_after:
+             delay = float(retry_after)
+         else:
+             # fall back to a guess
+             delay = 30
+         print(f"429 hit - waiting {delay} seconds...")
+         time.sleep(delay)
+         resp.raise_for_status()  # raise so the retry decorator tries again
+     else:
+         resp.raise_for_status()
+     return resp
+
+
+ def get_solved_discussions(n_posts: int = 50):
+     page = 1
+     discussions = []
+     while len(discussions) < n_posts:
+         url = f"{BASE_URL}/search.json?q=status:solved+order:latest&page={page}"
+         resp = safe_get(url)
+         topics = resp.json()["topics"]
+         if not topics:
+             break
+         for post in topics:
+             discussions.append(
+                 Discussion(
+                     title=post["fancy_title"],
+                     url=f"{BASE_URL}/t/{post['slug']}/{post['id']}",
+                     topic_id=post["id"],
+                     category=post["category_id"],
+                     created_at=post["created_at"],
+                 )
+             )
+             if len(discussions) >= n_posts:
+                 break
+         page += 1
+         time.sleep(0.5)  # simple pacing to avoid bursts
+     return discussions
+
+
+ def get_qa_pair(discussions, start_idx: int = 0):
+     for discussion in discussions[start_idx:]:
+         resp = safe_get(discussion.url + ".json")
+         data = resp.json()
+         posts = data["post_stream"]["posts"]
+         # clamp the accepted-answer index into the valid range of posts
+         accepted_nr = min(
+             max(data["accepted_answer"]["post_number"] - 1, 0), len(posts) - 1
+         )
+         question = posts[0]["cooked"]
+         solution = posts[accepted_nr]["cooked"]
+         yield QuestionAndSolution(
+             discussion_title=discussion.title,
+             discussion_url=discussion.url,
+             discussion_topic_id=discussion.topic_id,
+             discussion_category=discussion.category,
+             discussion_created_at=discussion.created_at,
+             question=question,
+             solution=solution,
+             thread=posts,
+         )
+         time.sleep(0.5)
+
+
+ if __name__ == "__main__":
+     discussions = get_solved_discussions(n_posts=300)
+     print(f"Fetched {len(discussions)} discussions")
+     with open("qa_pairs.jsonl", "a") as f:
+         for qa_pair in get_qa_pair(discussions):
+             f.write(qa_pair.model_dump_json() + "\n")
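
Note: before committing to the full 300-topic crawl in __main__, a small dry run can confirm that the Discourse endpoints and the accepted-answer extraction behave as expected. A sketch, assuming it is run from eval/scrape_discussions/ so the models import resolves:

# Hypothetical dry run: fetch a handful of solved topics and print their titles
# instead of appending to qa_pairs.jsonl.
from discussions_scraper import get_qa_pair, get_solved_discussions

discussions = get_solved_discussions(n_posts=5)
for qa in get_qa_pair(discussions):
    print(qa.discussion_title, "->", qa.discussion_url)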
pyproject.toml ADDED
@@ -0,0 +1,13 @@
+ [project]
+ name = "hf-agent"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "numpy>=1.24.0",
+     "requests>=2.32.5",
+     "pydantic>=2.12.3",
+     "litellm>=1.0.0",
+     "tenacity>=8.0.0",
+ ]
uv.lock ADDED
The diff for this file is too large to render. See raw diff