Commit 2b6a536: "thinking if we want eval or not"

Files changed:
- .gitignore +13 -0
- .python-version +1 -0
- README.md +0 -0
- eval/df.csv +0 -0
- eval/df.ipynb +137 -0
- eval/evaluate.py +202 -0
- eval/models.py +57 -0
- eval/qa_pairs.jsonl +0 -0
- eval/scrape_discussions/discussions_scraper.py +98 -0
- pyproject.toml +13 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,13 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+.env
+.DS_Store
+.claude/
.python-version
ADDED
@@ -0,0 +1 @@
+3.12
README.md
ADDED
File without changes
eval/df.csv
ADDED
The diff for this file is too large to render.
eval/df.ipynb
ADDED
@@ -0,0 +1,137 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b7f67653",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1fcf9d61b3664bc99d616101c201aca8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "ds = load_dataset(\"json\", data_files=\"qa_pairs.jsonl\", split=\"train\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "55cd7b9c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cb9452a5789b4b20bd0b01cce111f961",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4a77e027bda5405991bfde524347013c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4d70f759da55470fba30326d99d6ac1f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Processing Files (0 / 0): | | 0.00B / 0.00B "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "617db8df8ac94a2889d8760fc7f0113a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "New Data Upload: | | 0.00B / 0.00B "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/qa_pairs/commit/6947117631cb56686c192533427bb4400382b4fd', commit_message='Upload dataset', commit_description='', oid='6947117631cb56686c192533427bb4400382b4fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/qa_pairs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/qa_pairs'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds.push_to_hub(\"akseljoonas/qa_pairs\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16d4760d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
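
The notebook above loads qa_pairs.jsonl with the datasets library and pushes it to the Hub as akseljoonas/qa_pairs. For orientation, a minimal sketch of reading that dataset back for later evaluation runs (not part of the commit; it assumes the pushed repo stays available under the same id):

from datasets import load_dataset

# push_to_hub created a single "train" split, so we pull that split directly.
qa = load_dataset("akseljoonas/qa_pairs", split="train")
print(len(qa), qa.column_names)  # rows should carry the scraped question/solution fields
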
eval/evaluate.py
ADDED
@@ -0,0 +1,202 @@
+import asyncio
+import json
+from typing import Any, Dict
+
+import litellm
+from models import Correctness, JudgementResult
+
+# from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_judge_results.py#L16-L33
+GRADER_TEMPLATE = """
+Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
+
+[question]: {question}
+
+[response]: {response}
+
+Your judgement must be in the format and criteria specified below:
+
+extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
+
+[correct_answer]: {correct_answer}
+
+reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
+
+correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
+
+
+confidence: The extracted confidence score between 0|%| and 100|%| from [response]. Put 100 if there is no confidence score available.
+""".strip()
+
+CHOICE_STRINGS = ["yes", "no"]
+
+
+async def evaluate_single_response(
+    question: str,
+    response: str,
+    correct_answer: str,
+    model: str = "gpt-4o-mini",
+    semaphore: asyncio.Semaphore = None,
+) -> Dict[str, Any]:
+    """
+    Evaluate a single response against the ground truth using LLM as judge.
+
+    Args:
+        question: The question being answered
+        response: The response to evaluate
+        correct_answer: The ground truth answer
+        model: The LLM model to use for judging
+        semaphore: Semaphore for rate limiting
+
+    Returns:
+        Dictionary containing the judgement result and metadata
+    """
+    if semaphore:
+        async with semaphore:
+            return await _evaluate_single_response_impl(
+                question, response, correct_answer, model
+            )
+    else:
+        return await _evaluate_single_response_impl(
+            question, response, correct_answer, model
+        )
+
+
+async def _evaluate_single_response_impl(
+    question: str, response: str, correct_answer: str, model: str
+) -> Dict[str, Any]:
+    """Internal implementation of single response evaluation"""
+
+    prompt = GRADER_TEMPLATE.format(
+        question=question, response=response, correct_answer=correct_answer
+    )
+
+    # Use litellm with structured output
+    response = await litellm.acompletion(
+        model=model,
+        messages=[
+            {
+                "role": "system",
+                "content": "You are an expert judge evaluating answers for accuracy and equivalence.",
+            },
+            {"role": "user", "content": prompt},
+        ],
+        response_format=JudgementResult,
+        temperature=0.0,
+    )
+
+    # Parse structured output
+    result: JudgementResult = JudgementResult.model_validate_json(
+        response.choices[0].message.content
+    )
+    return result
+
+
+async def evaluate_dataset(
+    input_file: str,
+    eval_file: str,
+    output_file: str = "evaluation_results.jsonl",
+    model: str = "gpt-4o-mini",
+    max_concurrent: int = 30,
+    limit: int = None,
+) -> None:
+    """
+    Evaluate all QA pairs in the input file using LLM as judge.
+
+    Args:
+        input_file: Path to input JSONL file with QA pairs
+        output_file: Path to output JSONL file for results
+        model: The LLM model to use for judging
+        max_concurrent: Maximum number of concurrent API calls
+        limit: Optional limit on number of examples to evaluate
+    """
+    to_evaluate = [json.loads(line) for line in open(input_file, "r")]
+    if limit:
+        to_evaluate = to_evaluate[:limit]
+
+    print(f"Loaded {len(to_evaluate)} QA pairs to evaluate")
+
+    # Load dataset
+    print(f"Loading ground truth from {eval_file}...")
+    with open(eval_file, "r") as f:
+        ground_truths = [json.loads(line) for line in f]
+
+    print(f"Loaded {len(ground_truths)} ground truths")
+
+    # Create semaphore for rate limiting
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    # Create evaluation tasks
+    tasks = []
+    for qa_pair, ground_truth in zip(to_evaluate, ground_truths):
+        question = ground_truth.get("question", "")
+        ground_truth = ground_truth.get("solution", "")
+        response = qa_pair.get("solution", "")
+
+        task = evaluate_single_response(
+            response=response,
+            question=question,
+            correct_answer=ground_truth,
+            model=model,
+            semaphore=semaphore,
+        )
+        tasks.append(task)
+
+    # Run evaluations in parallel
+    print(f"Running evaluations with {max_concurrent} parallel workers...")
+    results = await asyncio.gather(*tasks)
+
+    # Combine results with original data
+    output_data = []
+    correct_count = 0
+    error_count = 0
+
+    for qa_pair, result in zip(to_evaluate, results):
+        print(result.model_dump_json())
+        # output_entry = {**qa_pair, "evaluation": result}
+        # output_data.append(output_entry)
+
+        if result.correct == Correctness.yes:
+            correct_count += 1
+        else:
+            error_count += 1
+
+    # # Write results
+    # print(f"Writing results to {output_file}...")
+    # with open(output_file, "w") as f:
+    #     for entry in output_data:
+    #         f.write(entry.model_dump_json() + "\n")
+
+    # Print summary
+    total = len(to_evaluate)
+    success_rate = (total - error_count) / total * 100 if total > 0 else 0
+    accuracy = correct_count / total * 100 if total > 0 else 0
+
+    print("\n" + "=" * 50)
+    print("EVALUATION SUMMARY")
+    print("=" * 50)
+    print(f"Total examples: {total}")
+    print(f"Successful evaluations: {total - error_count}")
+    print(f"Errors: {error_count}")
+    print(f"Success rate: {success_rate:.2f}%")
+    print(f"Correct answers: {correct_count}")
+    print(f"Accuracy: {accuracy:.2f}%")
+    print("=" * 50)
+
+
+#
+
+
+async def main():
+    """Main entry point for the evaluation script"""
+    await evaluate_dataset(
+        input_file="qa_pairs.jsonl",
+        eval_file="qa_pairs.jsonl",
+        output_file="evaluation_results.jsonl",
+        model="gpt-4o-mini",
+        max_concurrent=30,
+        limit=100,  # Set to None to evaluate all, or a number to limit
+    )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
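
For a sense of how the judge call composes, here is a minimal usage sketch of evaluate_single_response from the file above (the question, response, and answer strings are made up, and an API key that litellm can pick up, e.g. OPENAI_API_KEY for the default gpt-4o-mini judge, is assumed to be set):

import asyncio

from evaluate import evaluate_single_response

async def demo():
    # Judge one hypothetical answer against a hypothetical ground truth.
    result = await evaluate_single_response(
        question="Which split does load_dataset('json', data_files=...) produce by default?",
        response="It produces a 'train' split.",
        correct_answer="train",
    )
    print(result.correct, result.confidence)
    print(result.reasoning)

asyncio.run(demo())
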
eval/models.py
ADDED
@@ -0,0 +1,57 @@
+"""Shared data models for the HF agent project"""
+
+from datetime import datetime
+from enum import Enum
+
+from pydantic import BaseModel, Field
+
+
+class Discussion(BaseModel):
+    """Model for a discussion thread"""
+
+    title: str
+    url: str
+    topic_id: int
+    category: int
+    created_at: datetime
+
+
+class QuestionAndSolution(BaseModel):
+    """Model for a QA pair from a discussion"""
+
+    discussion_title: str
+    discussion_url: str
+    discussion_topic_id: int
+    discussion_category: int
+    discussion_created_at: datetime
+    thread: list[dict]
+    question: str
+    solution: str
+
+
+class Correctness(str, Enum):
+    yes = "yes"
+    no = "no"
+
+
+class JudgementResult(BaseModel):
+    """Structured output for LLM judge evaluation"""
+
+    extracted_final_answer: str = Field(
+        description="The final exact/snippet answer extracted from the response"
+    )
+    reasoning: str = Field(
+        description="Explanation of why the answer is correct or incorrect"
+    )
+    correct: Correctness = Field(description="'yes' if correct, 'no' if incorrect")
+    confidence: int = Field(
+        description="Confidence score between 0 and 100", ge=0, le=100
+    )
+
+
+class EvaluationResult(BaseModel):
+    """Model for evaluation results including metadata"""
+
+    success: bool
+    judgement: JudgementResult | None = None
+    error: str | None = None
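
A small sketch of how the JudgementResult schema from this file behaves (field values are made up for illustration):

from models import Correctness, JudgementResult

judgement = JudgementResult(
    extracted_final_answer="train",
    reasoning="The response names the 'train' split, which matches the reference answer.",
    correct=Correctness.yes,
    confidence=90,
)
print(judgement.model_dump_json())
# confidence is constrained with ge=0, le=100, so e.g. confidence=150 raises a ValidationError
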
eval/qa_pairs.jsonl
ADDED
The diff for this file is too large to render.
eval/scrape_discussions/discussions_scraper.py
ADDED
@@ -0,0 +1,98 @@
+import sys
+import time
+from pathlib import Path
+
+import requests
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+# Add parent directory to path to import models
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from models import Discussion, QuestionAndSolution
+
+BASE_URL = "https://discuss.huggingface.co"
+
+
+# configure retry decorator for your requests
+@retry(
+    stop=stop_after_attempt(5),
+    wait=wait_exponential(multiplier=1, min=1, max=60),
+    retry=retry_if_exception_type(requests.HTTPError),
+)
+def safe_get(url, **kwargs):
+    resp = requests.get(url, **kwargs)
+    if resp.status_code == 422:
+        # read retry-after header if present
+        retry_after = resp.headers.get("Retry-After")
+        if retry_after:
+            delay = float(retry_after)
+        else:
+            # fallback to guess
+            delay = 30
+        print(f"429 hit — waiting {delay} seconds...")
+        time.sleep(delay)
+        resp.raise_for_status()
+    else:
+        resp.raise_for_status()
+    return resp
+
+
+def get_solved_discussions(n_posts: int = 50):
+    page = 1
+    discussions = []
+    while len(discussions) < n_posts:
+        url = f"{BASE_URL}/search.json?q=status:solved+order:latest&page={page}"
+        resp = safe_get(url)
+        topics = resp.json()["topics"]
+        if not topics:
+            break
+        for post in topics:
+            discussions.append(
+                Discussion(
+                    title=post["fancy_title"],
+                    url=f"{BASE_URL}/t/{post['slug']}/{post['id']}",
+                    topic_id=post["id"],
+                    category=post["category_id"],
+                    created_at=post["created_at"],
+                )
+            )
+            if len(discussions) >= n_posts:
+                break
+        page += 1
+        time.sleep(0.5)  # simple pacing to avoid bursts
+    return discussions
+
+
+def get_qa_pair(discussions, start_idx: int = 0):
+    for discussion in discussions[start_idx:]:
+        resp = safe_get(discussion.url + ".json")
+        data = resp.json()
+        posts = data["post_stream"]["posts"]
+        accepted_nr = min(
+            max(data["accepted_answer"]["post_number"] - 1, 0), len(posts) - 1
+        )
+        question = posts[0]["cooked"]
+        solution = posts[accepted_nr]["cooked"]
+        yield QuestionAndSolution(
+            discussion_title=discussion.title,
+            discussion_url=discussion.url,
+            discussion_topic_id=discussion.topic_id,
+            discussion_category=discussion.category,
+            discussion_created_at=discussion.created_at,
+            question=question,
+            solution=solution,
+            thread=posts,
+        )
+        time.sleep(0.5)
+
+
+if __name__ == "__main__":
+    discussions = get_solved_discussions(n_posts=300)
+    print(f"Fetched {len(discussions)} discussions")
+    with open("qa_pairs.jsonl", "a") as f:
+        for qa_pair in get_qa_pair(discussions):
+            f.write(qa_pair.model_dump_json() + "\n")
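
The scraper drives the public Discourse JSON endpoints of discuss.huggingface.co: search.json lists solved topics, and each topic's .json view exposes the post_stream that the question and accepted solution are cut from. A minimal sketch of one such round trip (first search result only, no retries or pacing; illustrative, not part of the commit):

import requests

BASE_URL = "https://discuss.huggingface.co"

# One page of solved topics, newest first.
topics = requests.get(f"{BASE_URL}/search.json?q=status:solved+order:latest&page=1").json()["topics"]
topic = topics[0]

# The topic's JSON view carries the full thread.
thread = requests.get(f"{BASE_URL}/t/{topic['slug']}/{topic['id']}.json").json()
posts = thread["post_stream"]["posts"]
print(posts[0]["cooked"][:200])  # HTML body of the opening post, i.e. the question
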
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
+[project]
+name = "hf-agent"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "numpy>=1.24.0",
+    "requests>=2.32.5",
+    "pydantic>=2.12.3",
+    "litellm>=1.0.0",
+    "tenacity>=8.0.0",
+]
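
The .python-version and uv.lock files suggest the project is managed with uv; if so, `uv sync` should recreate the pinned environment, and something like `uv run python eval/evaluate.py` would run the judge script inside it.
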
uv.lock
ADDED
The diff for this file is too large to render.