akseljoonas (HF Staff) committed
Commit 2b6a536 · 0 Parent(s)

thinking if we want eval or not

.gitignore ADDED
@@ -0,0 +1,13 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+ .env
+ .DS_Store
+ .claude/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
README.md ADDED
File without changes
eval/df.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval/df.ipynb ADDED
@@ -0,0 +1,137 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b7f67653",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1fcf9d61b3664bc99d616101c201aca8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating train split: 0 examples [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
+ "ds = load_dataset(\"json\", data_files=\"qa_pairs.jsonl\", split=\"train\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "55cd7b9c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "cb9452a5789b4b20bd0b01cce111f961",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4a77e027bda5405991bfde524347013c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4d70f759da55470fba30326d99d6ac1f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Processing Files (0 / 0): | | 0.00B / 0.00B "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "617db8df8ac94a2889d8760fc7f0113a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "New Data Upload: | | 0.00B / 0.00B "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/qa_pairs/commit/6947117631cb56686c192533427bb4400382b4fd', commit_message='Upload dataset', commit_description='', oid='6947117631cb56686c192533427bb4400382b4fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/qa_pairs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/qa_pairs'), pr_revision=None, pr_num=None)"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ds.push_to_hub(\"akseljoonas/qa_pairs\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16d4760d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
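
Note: the notebook above loads eval/qa_pairs.jsonl with the datasets library and pushes it to the Hub as akseljoonas/qa_pairs. A minimal sketch of pulling that dataset back down for downstream use, assuming the repo stays accessible under that name (the column names follow QuestionAndSolution in eval/models.py):

# Sketch only: read the pushed QA pairs back from the Hub.
from datasets import load_dataset

qa_ds = load_dataset("akseljoonas/qa_pairs", split="train")
print(qa_ds[0]["question"][:200])  # "question"/"solution" columns come from QuestionAndSolution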
eval/evaluate.py ADDED
@@ -0,0 +1,202 @@
+ import asyncio
+ import json
+
+ import litellm
+ from models import Correctness, JudgementResult
+
+ # from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_judge_results.py#L16-L33
+ GRADER_TEMPLATE = """
+ Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
+
+ [question]: {question}
+
+ [response]: {response}
+
+ Your judgement must be in the format and criteria specified below:
+
+ extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
+
+ [correct_answer]: {correct_answer}
+
+ reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
+
+ correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
+
+
+ confidence: The extracted confidence score between 0|%| and 100|%| from [response]. Put 100 if there is no confidence score available.
+ """.strip()
+
+ CHOICE_STRINGS = ["yes", "no"]
+
+
+ async def evaluate_single_response(
+     question: str,
+     response: str,
+     correct_answer: str,
+     model: str = "gpt-4o-mini",
+     semaphore: asyncio.Semaphore | None = None,
+ ) -> JudgementResult:
+     """
+     Evaluate a single response against the ground truth using an LLM as judge.
+
+     Args:
+         question: The question being answered
+         response: The response to evaluate
+         correct_answer: The ground truth answer
+         model: The LLM model to use for judging
+         semaphore: Optional semaphore for rate limiting
+
+     Returns:
+         The parsed JudgementResult from the judge model
+     """
+     if semaphore:
+         async with semaphore:
+             return await _evaluate_single_response_impl(
+                 question, response, correct_answer, model
+             )
+     else:
+         return await _evaluate_single_response_impl(
+             question, response, correct_answer, model
+         )
+
+
+ async def _evaluate_single_response_impl(
+     question: str, response: str, correct_answer: str, model: str
+ ) -> JudgementResult:
+     """Internal implementation of single response evaluation"""
+
+     prompt = GRADER_TEMPLATE.format(
+         question=question, response=response, correct_answer=correct_answer
+     )
+
+     # Use litellm with structured output
+     completion = await litellm.acompletion(
+         model=model,
+         messages=[
+             {
+                 "role": "system",
+                 "content": "You are an expert judge evaluating answers for accuracy and equivalence.",
+             },
+             {"role": "user", "content": prompt},
+         ],
+         response_format=JudgementResult,
+         temperature=0.0,
+     )
+
+     # Parse structured output
+     result: JudgementResult = JudgementResult.model_validate_json(
+         completion.choices[0].message.content
+     )
+     return result
+
+
+ async def evaluate_dataset(
+     input_file: str,
+     eval_file: str,
+     output_file: str = "evaluation_results.jsonl",
+     model: str = "gpt-4o-mini",
+     max_concurrent: int = 30,
+     limit: int | None = None,
+ ) -> None:
+     """
+     Evaluate all QA pairs in the input file using an LLM as judge.
+
+     Args:
+         input_file: Path to input JSONL file with QA pairs to grade
+         eval_file: Path to JSONL file with the ground-truth QA pairs
+         output_file: Path to output JSONL file for results
+         model: The LLM model to use for judging
+         max_concurrent: Maximum number of concurrent API calls
+         limit: Optional limit on number of examples to evaluate
+     """
+     with open(input_file, "r") as f:
+         to_evaluate = [json.loads(line) for line in f]
+     if limit:
+         to_evaluate = to_evaluate[:limit]
+
+     print(f"Loaded {len(to_evaluate)} QA pairs to evaluate")
+
+     # Load ground truth
+     print(f"Loading ground truth from {eval_file}...")
+     with open(eval_file, "r") as f:
+         ground_truths = [json.loads(line) for line in f]
+
+     print(f"Loaded {len(ground_truths)} ground truths")
+
+     # Create semaphore for rate limiting
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     # Create evaluation tasks
+     tasks = []
+     for qa_pair, ground_truth in zip(to_evaluate, ground_truths):
+         question = ground_truth.get("question", "")
+         correct_answer = ground_truth.get("solution", "")
+         response = qa_pair.get("solution", "")
+
+         task = evaluate_single_response(
+             response=response,
+             question=question,
+             correct_answer=correct_answer,
+             model=model,
+             semaphore=semaphore,
+         )
+         tasks.append(task)
+
+     # Run evaluations in parallel
+     print(f"Running evaluations with {max_concurrent} parallel workers...")
+     results = await asyncio.gather(*tasks)
+
+     # Tally results (combining them with the original data is currently disabled)
+     output_data = []
+     correct_count = 0
+     error_count = 0
+
+     for qa_pair, result in zip(to_evaluate, results):
+         print(result.model_dump_json())
+         # output_entry = {**qa_pair, "evaluation": result}
+         # output_data.append(output_entry)
+
+         if result.correct == Correctness.yes:
+             correct_count += 1
+         else:
+             error_count += 1
+
+     # # Write results
+     # print(f"Writing results to {output_file}...")
+     # with open(output_file, "w") as f:
+     #     for entry in output_data:
+     #         f.write(entry.model_dump_json() + "\n")
+
+     # Print summary
+     total = len(to_evaluate)
+     success_rate = (total - error_count) / total * 100 if total > 0 else 0
+     accuracy = correct_count / total * 100 if total > 0 else 0
+
+     print("\n" + "=" * 50)
+     print("EVALUATION SUMMARY")
+     print("=" * 50)
+     print(f"Total examples: {total}")
+     print(f"Successful evaluations: {total - error_count}")
+     print(f"Errors: {error_count}")
+     print(f"Success rate: {success_rate:.2f}%")
+     print(f"Correct answers: {correct_count}")
+     print(f"Accuracy: {accuracy:.2f}%")
+     print("=" * 50)
+
+
+ async def main():
+     """Main entry point for the evaluation script"""
+     await evaluate_dataset(
+         input_file="qa_pairs.jsonl",
+         eval_file="qa_pairs.jsonl",
+         output_file="evaluation_results.jsonl",
+         model="gpt-4o-mini",
+         max_concurrent=30,
+         limit=100,  # Set to None to evaluate all, or a number to limit
+     )
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
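
Note: evaluate.py drives the LLM-as-judge loop through litellm, so the judge model needs credentials in the environment (for the default gpt-4o-mini that would normally be OPENAI_API_KEY). Also note that main() currently passes qa_pairs.jsonl as both input_file and eval_file, so the ground-truth solutions are graded against themselves. A minimal smoke-test sketch, assuming the key is set and the code is run from the eval/ directory:

# Hypothetical quick run with a small limit before grading a full dataset.
import asyncio

from evaluate import evaluate_dataset

asyncio.run(
    evaluate_dataset(
        input_file="qa_pairs.jsonl",  # responses to grade
        eval_file="qa_pairs.jsonl",   # ground truth (same file here, as in main())
        model="gpt-4o-mini",
        max_concurrent=5,             # keep concurrency low for a sanity check
        limit=5,
    )
)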
eval/models.py ADDED
@@ -0,0 +1,57 @@
+ """Shared data models for the HF agent project"""
+
+ from datetime import datetime
+ from enum import Enum
+
+ from pydantic import BaseModel, Field
+
+
+ class Discussion(BaseModel):
+     """Model for a discussion thread"""
+
+     title: str
+     url: str
+     topic_id: int
+     category: int
+     created_at: datetime
+
+
+ class QuestionAndSolution(BaseModel):
+     """Model for a QA pair from a discussion"""
+
+     discussion_title: str
+     discussion_url: str
+     discussion_topic_id: int
+     discussion_category: int
+     discussion_created_at: datetime
+     thread: list[dict]
+     question: str
+     solution: str
+
+
+ class Correctness(str, Enum):
+     yes = "yes"
+     no = "no"
+
+
+ class JudgementResult(BaseModel):
+     """Structured output for LLM judge evaluation"""
+
+     extracted_final_answer: str = Field(
+         description="The final exact/snippet answer extracted from the response"
+     )
+     reasoning: str = Field(
+         description="Explanation of why the answer is correct or incorrect"
+     )
+     correct: Correctness = Field(description="'yes' if correct, 'no' if incorrect")
+     confidence: int = Field(
+         description="Confidence score between 0 and 100", ge=0, le=100
+     )
+
+
+ class EvaluationResult(BaseModel):
+     """Model for evaluation results including metadata"""
+
+     success: bool
+     judgement: JudgementResult | None = None
+     error: str | None = None
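
Note: a small sketch of how these models fit together. JudgementResult validates the judge's structured reply (as evaluate.py does), and EvaluationResult can wrap it with success/error metadata, which the evaluation script does not yet populate. The JSON payload here is illustrative, not real judge output:

from models import Correctness, EvaluationResult, JudgementResult

raw = '{"extracted_final_answer": "42", "reasoning": "Matches the reference.", "correct": "yes", "confidence": 90}'
judgement = JudgementResult.model_validate_json(raw)
wrapped = EvaluationResult(success=True, judgement=judgement)

assert wrapped.judgement is not None and wrapped.judgement.correct == Correctness.yes
print(wrapped.model_dump_json())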
eval/qa_pairs.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval/scrape_discussions/discussions_scraper.py ADDED
@@ -0,0 +1,98 @@
+ import sys
+ import time
+ from pathlib import Path
+
+ import requests
+ from tenacity import (
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+ )
+
+ # Add parent directory to path to import models
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+ from models import Discussion, QuestionAndSolution
+
+ BASE_URL = "https://discuss.huggingface.co"
+
+
+ # Retry decorator for HTTP GETs: up to 5 attempts with exponential backoff
+ @retry(
+     stop=stop_after_attempt(5),
+     wait=wait_exponential(multiplier=1, min=1, max=60),
+     retry=retry_if_exception_type(requests.HTTPError),
+ )
+ def safe_get(url, **kwargs):
+     resp = requests.get(url, **kwargs)
+     if resp.status_code == 429:  # rate limited
+         # read the Retry-After header if present
+         retry_after = resp.headers.get("Retry-After")
+         if retry_after:
+             delay = float(retry_after)
+         else:
+             # fall back to a guess
+             delay = 30
+         print(f"429 hit - waiting {delay} seconds...")
+         time.sleep(delay)
+         resp.raise_for_status()  # raise so the retry decorator tries again
+     else:
+         resp.raise_for_status()
+     return resp
+
+
+ def get_solved_discussions(n_posts: int = 50):
+     page = 1
+     discussions = []
+     while len(discussions) < n_posts:
+         url = f"{BASE_URL}/search.json?q=status:solved+order:latest&page={page}"
+         resp = safe_get(url)
+         topics = resp.json()["topics"]
+         if not topics:
+             break
+         for post in topics:
+             discussions.append(
+                 Discussion(
+                     title=post["fancy_title"],
+                     url=f"{BASE_URL}/t/{post['slug']}/{post['id']}",
+                     topic_id=post["id"],
+                     category=post["category_id"],
+                     created_at=post["created_at"],
+                 )
+             )
+             if len(discussions) >= n_posts:
+                 break
+         page += 1
+         time.sleep(0.5)  # simple pacing to avoid bursts
+     return discussions
+
+
+ def get_qa_pair(discussions, start_idx: int = 0):
+     for discussion in discussions[start_idx:]:
+         resp = safe_get(discussion.url + ".json")
+         data = resp.json()
+         posts = data["post_stream"]["posts"]
+         # clamp the accepted-answer index into the valid range of posts
+         accepted_nr = min(
+             max(data["accepted_answer"]["post_number"] - 1, 0), len(posts) - 1
+         )
+         question = posts[0]["cooked"]
+         solution = posts[accepted_nr]["cooked"]
+         yield QuestionAndSolution(
+             discussion_title=discussion.title,
+             discussion_url=discussion.url,
+             discussion_topic_id=discussion.topic_id,
+             discussion_category=discussion.category,
+             discussion_created_at=discussion.created_at,
+             question=question,
+             solution=solution,
+             thread=posts,
+         )
+         time.sleep(0.5)
+
+
+ if __name__ == "__main__":
+     discussions = get_solved_discussions(n_posts=300)
+     print(f"Fetched {len(discussions)} discussions")
+     with open("qa_pairs.jsonl", "a") as f:
+         for qa_pair in get_qa_pair(discussions):
+             f.write(qa_pair.model_dump_json() + "\n")
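
Note: before committing to the full 300-topic crawl in __main__, a small dry run can confirm that the Discourse endpoints and the accepted-answer extraction behave as expected. A sketch, assuming it is run from eval/scrape_discussions/ so the models import resolves:

# Hypothetical dry run: fetch a handful of solved topics and print their titles
# instead of appending to qa_pairs.jsonl.
from discussions_scraper import get_qa_pair, get_solved_discussions

discussions = get_solved_discussions(n_posts=5)
for qa in get_qa_pair(discussions):
    print(qa.discussion_title, "->", qa.discussion_url)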
pyproject.toml ADDED
@@ -0,0 +1,13 @@
+ [project]
+ name = "hf-agent"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "numpy>=1.24.0",
+     "requests>=2.32.5",
+     "pydantic>=2.12.3",
+     "litellm>=1.0.0",
+     "tenacity>=8.0.0",
+ ]
uv.lock ADDED
The diff for this file is too large to render. See raw diff