Commit 92ea780
Parent(s): 82484ad

initial commit, space validation stuff
Files changed:
- .gitignore +18 -0
- .gradio/certificate.pem +31 -0
- Dockerfile +16 -0
- README.md +1 -0
- app.py +336 -0
- dedup.py +164 -0
- eval.schema.json +653 -0
- eval_types.py +378 -0
- instance_level_eval.schema.json +329 -0
- instance_level_types.py +188 -0
- pyproject.toml +18 -0
- requirement_plan.txt +44 -0
- uv.lock +0 -0
- validate_data.py +190 -0
.gitignore ADDED

__pycache__/
*.py[cod]
*$py.class
*.so
*.egg-info/
dist/
build/
.eggs/
*.egg
.venv/
venv/
env/
.env
*.log
.mypy_cache/
.pytest_cache/
.ruff_cache/
/tmp_data/
.gradio/certificate.pem ADDED

-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
Dockerfile ADDED

FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim

WORKDIR /app

ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy

COPY pyproject.toml uv.lock ./
RUN uv sync --locked --no-install-project

COPY . /app
RUN uv sync --locked

ENV PATH="/app/.venv/bin:$PATH"

ENTRYPOINT []
CMD ["python", "app.py"]
README.md CHANGED

@@ -4,6 +4,7 @@ emoji: 👀
 colorFrom: red
 colorTo: purple
 sdk: docker
+app_port: 7860
 pinned: false
 ---
 
app.py ADDED

"""EEE Validator — HuggingFace Space webhook handler.

Listens for PR events on evaleval/EEE_datastore, validates changed data
files with Pydantic, checks for duplicates, and comments results on the PR.
"""

import logging
import os
import tempfile
import threading
from datetime import datetime, timezone

from huggingface_hub import HfApi, WebhooksServer, webhook_endpoint
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError

from dedup import DATASET_REPO_ID, DedupReport, check_duplicates, load_manifest
from validate_data import FileValidationResult, validate_with_pydantic

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

api = HfApi()


# ---------------------------------------------------------------------------
# Changed file discovery
# ---------------------------------------------------------------------------

def find_changed_files(pr_num: int) -> list[str]:
    """Find added/modified .json and .jsonl files by comparing PR tree to main.

    Falls back to tree comparison since DiscussionWithDetails.diff can be None
    for dataset repos.
    """
    revision = f"refs/pr/{pr_num}"

    def _list_files(rev: str) -> dict[str, str]:
        """Return {path: oid} for all files at a given revision."""
        files = {}
        for entry in api.list_repo_tree(
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            revision=rev,
            recursive=True,
        ):
            if hasattr(entry, "rfilename"):
                files[entry.rfilename] = getattr(entry, "oid", None)
        return files

    pr_files = _list_files(revision)
    main_files = _list_files("main")

    changed: list[str] = []
    for path, oid in pr_files.items():
        if not path.startswith("data/"):
            continue
        if not (path.endswith(".json") or path.endswith(".jsonl")):
            continue
        # New file, or existing file with different content
        if path not in main_files or main_files[path] != oid:
            changed.append(path)

    return changed


# ---------------------------------------------------------------------------
# File download
# ---------------------------------------------------------------------------

def download_pr_files(
    file_paths: list[str], pr_num: int, tmp_dir: str
) -> dict[str, str]:
    """Download files from a PR branch and return map of repo-path -> local-path."""
    downloaded: dict[str, str] = {}
    revision = f"refs/pr/{pr_num}"

    for file_path in file_paths:
        try:
            local_path = hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=file_path,
                repo_type="dataset",
                revision=revision,
                local_dir=tmp_dir,
            )
            downloaded[file_path] = local_path
            logger.info("Downloaded %s -> %s", file_path, local_path)
        except EntryNotFoundError:
            logger.warning("File not found in PR: %s", file_path)
        except Exception:
            logger.exception("Failed to download %s", file_path)

    return downloaded


# ---------------------------------------------------------------------------
# Validation orchestration
# ---------------------------------------------------------------------------

def validate_files(
    downloaded: dict[str, str],
) -> list[FileValidationResult]:
    """Validate all downloaded files and return results."""
    results: list[FileValidationResult] = []

    for repo_path, local_path in downloaded.items():
        if repo_path.endswith(".jsonl"):
            file_type = "jsonl"
        else:
            file_type = "json"

        result = validate_with_pydantic(local_path, file_type)
        # Store the repo-relative path for reporting
        result.file_path = repo_path
        results.append(result)

    return results


# ---------------------------------------------------------------------------
# Deduplication orchestration
# ---------------------------------------------------------------------------

def run_dedup(
    file_paths: list[str], downloaded: dict[str, str]
) -> DedupReport:
    """Load manifest and check all files for duplicates."""
    manifest = load_manifest(api)

    # Read file contents as bytes
    file_contents: dict[str, bytes] = {}
    for repo_path, local_path in downloaded.items():
        with open(local_path, "rb") as f:
            file_contents[repo_path] = f.read()

    return check_duplicates(file_paths, file_contents, manifest)


# ---------------------------------------------------------------------------
# Comment formatting
# ---------------------------------------------------------------------------

def format_comment(
    pr_num: int,
    validation_results: list[FileValidationResult],
    dedup_report: DedupReport,
) -> str:
    """Format the PR comment as markdown."""
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    num_passed = sum(1 for r in validation_results if r.valid)
    num_failed = sum(1 for r in validation_results if not r.valid)
    total = len(validation_results)

    lines = [
        "## EEE Validation Report",
        f"**PR:** #{pr_num} | **Run:** {now}",
        "",
        "### Validation Results",
        "| File | Status | Details |",
        "|------|--------|---------|",
    ]

    for r in validation_results:
        if r.valid:
            type_label = "EvaluationLog" if r.file_type == "json" else "InstanceLevelEvaluationLog"
            lines.append(f"| `{r.file_path}` | PASS | Validated as {type_label} |")
        else:
            # Show first few errors to avoid extremely long comments
            error_summary = "; ".join(r.errors[:5])
            if len(r.errors) > 5:
                error_summary += f" ... and {len(r.errors) - 5} more error(s)"
            lines.append(f"| `{r.file_path}` | FAIL | {error_summary} |")

    # Dedup section
    lines.append("")
    lines.append("### Duplicate Check")

    has_any_dupes = False
    for dr in dedup_report.results:
        if dr.exact_duplicate_of:
            lines.append(
                f"- **Exact duplicate:** `{dr.file_path}` is identical to "
                f"existing `{dr.exact_duplicate_of}`"
            )
            has_any_dupes = True
        if dr.near_duplicate_of:
            lines.append(
                f"- **Potential near-duplicate:** `{dr.file_path}` shares fingerprint "
                f"with existing `{dr.near_duplicate_of}` "
                f"(identical content minus timestamps/UUIDs)"
            )
            has_any_dupes = True

    if not has_any_dupes:
        lines.append("- No exact or near duplicates found.")

    # Summary
    lines.append("")
    lines.append("### Summary")
    lines.append(f"{total} file(s) checked: {num_passed} passed, {num_failed} failed")

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Core validation logic (shared by webhook + startup sweep)
# ---------------------------------------------------------------------------

REPORT_HEADER = "## EEE Validation Report"


def process_pr(pr_num: int) -> dict:
    """Run full validation + dedup on a PR and post results as a comment."""
    logger.info("Processing PR #%d", pr_num)

    # Find changed data files by comparing PR tree to main
    changed_files = find_changed_files(pr_num)
    if not changed_files:
        logger.info("No data files changed in PR #%d", pr_num)
        return {"status": "skipped", "reason": "no data files changed"}

    logger.info("Found %d changed data file(s): %s", len(changed_files), changed_files)

    # Create temp directory for downloads
    tmp_dir = tempfile.mkdtemp(prefix=f"eee_validate_{pr_num}_")

    # Download changed files from the PR branch
    downloaded = download_pr_files(changed_files, pr_num, tmp_dir)
    if not downloaded:
        logger.warning("No files could be downloaded for PR #%d", pr_num)
        return {"status": "error", "reason": "no files downloaded"}

    # Validate files
    validation_results = validate_files(downloaded)

    # Run dedup check
    dedup_report = run_dedup(changed_files, downloaded)

    # Format and post comment
    comment = format_comment(pr_num, validation_results, dedup_report)
    logger.info("Posting validation comment on PR #%d", pr_num)

    api.comment_discussion(
        repo_id=DATASET_REPO_ID,
        discussion_num=pr_num,
        comment=comment,
        repo_type="dataset",
    )

    return {
        "status": "ok",
        "pr": pr_num,
        "files_checked": len(validation_results),
        "passed": sum(1 for r in validation_results if r.valid),
        "failed": sum(1 for r in validation_results if not r.valid),
    }


# ---------------------------------------------------------------------------
# Startup sweep — catch PRs missed while the Space was asleep
# ---------------------------------------------------------------------------

def pr_has_validation_comment(pr_num: int) -> bool:
    """Check if a PR already has an EEE Validation Report comment."""
    details = api.get_discussion_details(
        repo_id=DATASET_REPO_ID,
        discussion_num=pr_num,
        repo_type="dataset",
    )
    for event in details.events:
        if event.type == "comment" and event.content and event.content.startswith(REPORT_HEADER):
            return True
    return False


def startup_sweep() -> None:
    """Scan open PRs and validate any that are missing a report."""
    logger.info("Running startup sweep for unvalidated PRs...")
    try:
        discussions = api.get_repo_discussions(
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
        )
        for disc in discussions:
            if not disc.is_pull_request or disc.status != "open":
                continue
            if pr_has_validation_comment(disc.num):
                logger.info("PR #%d already has a validation report, skipping", disc.num)
                continue
            logger.info("PR #%d has no validation report, processing", disc.num)
            try:
                process_pr(disc.num)
            except Exception:
                logger.exception("Startup sweep failed for PR #%d", disc.num)
    except Exception:
        logger.exception("Startup sweep failed to list discussions")
    logger.info("Startup sweep complete")


# Run sweep in background thread so it doesn't block the webhook server startup
threading.Thread(target=startup_sweep, daemon=True).start()


# ---------------------------------------------------------------------------
# Webhook endpoint
# ---------------------------------------------------------------------------

@webhook_endpoint
async def validate(payload):
    """Handle incoming webhook events from HuggingFace."""
    logger.info("Received webhook event: %s", payload.event)

    # Filter: only dataset PRs, ignore comments
    if payload.event.scope == "discussion.comment":
        logger.info("Skipping comment event")
        return {"status": "skipped", "reason": "comment event"}

    if payload.repo.type != "dataset":
        logger.info("Skipping non-dataset event (type=%s)", payload.repo.type)
        return {"status": "skipped", "reason": "not a dataset repo"}

    if not payload.discussion or not payload.discussion.isPullRequest:
        logger.info("Skipping non-PR event")
        return {"status": "skipped", "reason": "not a pull request"}

    pr_num = payload.discussion.num

    try:
        return process_pr(pr_num)
    except Exception:
        logger.exception("Failed to process PR #%d", pr_num)
        return {"status": "error", "reason": "processing failed"}
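As a side note, the refs/pr/{N} revision convention that download_pr_files relies on can be tried read-only from any Python shell with huggingface_hub installed; nothing is posted back to the repo. A minimal sketch, where the PR number and file path are hypothetical placeholders:

    from huggingface_hub import hf_hub_download

    # Fetch one file as it exists on a PR branch of the datastore.
    local_path = hf_hub_download(
        repo_id="evaleval/EEE_datastore",
        repo_type="dataset",
        revision="refs/pr/1",          # hypothetical PR number
        filename="data/example.json",  # hypothetical repo path
    )
    print(local_path)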
dedup.py ADDED

"""Deduplication module for EEE validation pipeline.

Two-level dedup:
- Exact duplicates: SHA256 hash of entire file content
- Near duplicates: SHA256 hash of content minus timestamps/UUIDs
"""

import hashlib
import json
import logging
from dataclasses import dataclass, field
from typing import Any

from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError

logger = logging.getLogger(__name__)

DATASET_REPO_ID = "evaleval/EEE_datastore"
MANIFEST_PATH = "manifest.json"

# Fields to strip for near-duplicate fingerprinting
FINGERPRINT_STRIP_FIELDS = {
    "retrieved_timestamp",
    "evaluation_id",
    "evaluation_timestamp",
}


def compute_sha256(content: bytes) -> str:
    return hashlib.sha256(content).hexdigest()


def _strip_fields(data: dict[str, Any], fields_to_strip: set[str]) -> dict[str, Any]:
    """Recursively strip specified fields from a dict for fingerprinting."""
    result = {}
    for key, value in data.items():
        if key in fields_to_strip:
            continue
        if isinstance(value, dict):
            result[key] = _strip_fields(value, fields_to_strip)
        elif isinstance(value, list):
            result[key] = [
                _strip_fields(item, fields_to_strip) if isinstance(item, dict) else item
                for item in value
            ]
        else:
            result[key] = value
    return result


def compute_fingerprint(content: bytes) -> str:
    """Compute a near-duplicate fingerprint by hashing content minus timestamps/UUIDs."""
    try:
        data = json.loads(content)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # If we can't parse as JSON, fall back to full content hash
        return compute_sha256(content)

    stripped = _strip_fields(data, FINGERPRINT_STRIP_FIELDS)
    # Serialize deterministically
    canonical = json.dumps(stripped, sort_keys=True, ensure_ascii=True).encode()
    return hashlib.sha256(canonical).hexdigest()


@dataclass
class DedupResult:
    """Results of deduplication check for a single file."""
    file_path: str
    sha256: str
    fingerprint: str
    exact_duplicate_of: str | None = None
    near_duplicate_of: str | None = None


@dataclass
class DedupReport:
    """Aggregated dedup report across all checked files."""
    results: list[DedupResult] = field(default_factory=list)

    @property
    def has_exact_duplicates(self) -> bool:
        return any(r.exact_duplicate_of is not None for r in self.results)

    @property
    def has_near_duplicates(self) -> bool:
        return any(r.near_duplicate_of is not None for r in self.results)


def load_manifest(api: HfApi) -> dict[str, Any]:
    """Download and parse manifest.json from the dataset repo's main branch."""
    try:
        manifest_file = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=MANIFEST_PATH,
            repo_type="dataset",
            revision="main",
        )
        with open(manifest_file, "r") as f:
            return json.load(f)
    except (EntryNotFoundError, RepositoryNotFoundError):
        logger.warning("manifest.json not found in %s, using empty manifest", DATASET_REPO_ID)
        return {"files": {}}
    except Exception:
        logger.exception("Failed to load manifest.json")
        return {"files": {}}


def check_duplicates(
    file_paths: list[str],
    file_contents: dict[str, bytes],
    manifest: dict[str, Any],
) -> DedupReport:
    """Check files against the manifest for exact and near duplicates.

    Args:
        file_paths: Repo-relative paths of files to check (e.g. "data/gsm8k/.../abc.json")
        file_contents: Map of repo-relative path -> raw file bytes
        manifest: Parsed manifest.json with "files" key
    """
    report = DedupReport()
    manifest_files = manifest.get("files", {})

    # Build reverse lookups from manifest
    sha256_to_path: dict[str, str] = {}
    fingerprint_to_path: dict[str, str] = {}
    for path, entry in manifest_files.items():
        sha256_to_path[entry["sha256"]] = path
        fingerprint_to_path[entry["fingerprint"]] = path

    for file_path in file_paths:
        content = file_contents.get(file_path)
        if content is None:
            continue

        sha256 = compute_sha256(content)

        # Only compute fingerprints for .json files (not .jsonl)
        if file_path.endswith(".json"):
            fingerprint = compute_fingerprint(content)
        else:
            fingerprint = sha256  # For JSONL, fingerprint == sha256

        result = DedupResult(
            file_path=file_path,
            sha256=sha256,
            fingerprint=fingerprint,
        )

        # Check exact duplicate
        if sha256 in sha256_to_path and sha256_to_path[sha256] != file_path:
            result.exact_duplicate_of = sha256_to_path[sha256]

        # Check near duplicate (only if not already an exact duplicate)
        if (
            result.exact_duplicate_of is None
            and fingerprint in fingerprint_to_path
            and fingerprint_to_path[fingerprint] != file_path
        ):
            result.near_duplicate_of = fingerprint_to_path[fingerprint]

        report.results.append(result)

    return report
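The two-level dedup idea is easiest to see on a toy payload: two byte strings that differ only in one of the FINGERPRINT_STRIP_FIELDS get different exact hashes but the same fingerprint. A minimal sketch, assuming it runs next to dedup.py:

    import json

    from dedup import compute_fingerprint, compute_sha256

    a = json.dumps({"score": 0.9, "retrieved_timestamp": "1718000000"}).encode()
    b = json.dumps({"score": 0.9, "retrieved_timestamp": "1719000000"}).encode()

    assert compute_sha256(a) != compute_sha256(b)            # exact hashes differ
    assert compute_fingerprint(a) == compute_fingerprint(b)  # fingerprints collide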
eval.schema.json ADDED

{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "0.2.0",
  "type": "object",
  "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
  "required": [
    "schema_version",
    "evaluation_id",
    "retrieved_timestamp",
    "source_metadata",
    "model_info",
    "evaluation_results"
  ],
  "additionalProperties": false,
  "properties": {
    "schema_version": {
      "type": "string",
      "description": "Version of the schema used for this evaluation data"
    },
    "evaluation_id": {
      "type": "string",
      "description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
    },
    "evaluation_timestamp": {
      "type": "string",
      "description": "Timestamp for when the evaluation was run"
    },
    "retrieved_timestamp": {
      "type": "string",
      "description": "Timestamp for when this record was created - using Unix Epoch time format"
    },
    "source_metadata": {
      "type": "object",
      "description": "Metadata about the source of the leaderboard data",
      "required": [
        "source_type",
        "source_organization_name",
        "evaluator_relationship"
      ],
      "properties": {
        "source_name": {
          "type": "string",
          "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
        },
        "source_type": {
          "type": "string",
          "enum": ["documentation", "evaluation_run"],
          "description": "Whether the data comes from a direct evaluation run or from documentation"
        },
        "source_organization_name": {
          "type": "string",
          "description": "Name of the organization that provides the data"
        },
        "source_organization_url": {
          "type": "string",
          "description": "URL for the organization that provides the data"
        },
        "source_organization_logo_url": {
          "type": "string",
          "description": "URL for the Logo for the organization that provides the data"
        },
        "evaluator_relationship": {
          "type": "string",
          "description": "Relationship between the evaluator and the model",
          "enum": ["first_party", "third_party", "collaborative", "other"]
        }
      }
    },
    "model_info": { "$ref": "#/$defs/model_info" },
    "evaluation_results": {
      "type": "array",
      "description": "Array of evaluation results",
      "items": {
        "type": "object",
        "required": [
          "evaluation_name",
          "source_data",
          "metric_config",
          "score_details"
        ],
        "properties": {
          "evaluation_name": {
            "type": "string",
            "description": "Name of the evaluation"
          },
          "source_data": {
            "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
            "oneOf": [
              { "$ref": "#/$defs/source_data_url" },
              { "$ref": "#/$defs/source_data_hf" },
              { "$ref": "#/$defs/source_data_private" }
            ]
          },
          "evaluation_timestamp": {
            "type": "string",
            "description": "Timestamp for when the evaluations were run"
          },
          "metric_config": {
            "type": "object",
            "description": "Details about the metric",
            "required": ["lower_is_better"],
            "properties": {
              "evaluation_description": {
                "type": "string",
                "description": "Description of the evaluation"
              },
              "lower_is_better": {
                "type": "boolean",
                "description": "Whether a lower score is better"
              },
              "score_type": {
                "type": "string",
                "description": "Type of score",
                "enum": ["binary", "continuous", "levels"]
              },
              "level_names": {
                "type": "array",
                "description": "Names of the score levels",
                "items": { "type": "string" }
              },
              "level_metadata": {
                "type": "array",
                "description": "Additional Description for each Score Level",
                "items": { "type": "string" }
              },
              "has_unknown_level": {
                "type": "boolean",
                "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
              },
              "min_score": {
                "type": "number",
                "description": "Minimum possible score for continuous metric"
              },
              "max_score": {
                "type": "number",
                "description": "Maximum possible score for continuous metric"
              },
              "llm_scoring": {
                "type": "object",
                "description": "Configuration when LLM is used as scorer/judge",
                "additionalProperties": true,
                "required": ["judges", "input_prompt"],
                "properties": {
                  "judges": {
                    "type": "array",
                    "description": "LLM judge(s) - single item for judge, multiple for jury",
                    "items": { "$ref": "#/$defs/judge_config" },
                    "minItems": 1
                  },
                  "input_prompt": {
                    "type": "string",
                    "description": "Prompt template used for judging"
                  },
                  "aggregation_method": {
                    "type": "string",
                    "enum": ["majority_vote", "average", "weighted_average", "median"],
                    "description": "How to aggregate scores when multiple judges"
                  },
                  "expert_baseline": {
                    "type": "number",
                    "description": "Expert/human baseline score for comparison"
                  },
                  "additional_details": { "$ref": "#/$defs/additional_properties_object" }
                }
              }
            },
            "if": {
              "properties": { "score_type": { "const": "levels" } }
            },
            "then": {
              "required": ["level_names", "has_unknown_level"]
            },
            "else": {
              "if": {
                "properties": { "score_type": { "const": "continuous" } }
              },
              "then": {
                "required": ["min_score", "max_score"]
              }
            }
          },
          "score_details": {
            "type": "object",
            "description": "The score for the evaluation and related details",
            "required": ["score"],
            "properties": {
              "score": {
                "type": "number",
                "description": "The score for the evaluation"
              },
              "details": { "$ref": "#/$defs/additional_properties_object" },
              "uncertainty": {
                "type": "object",
                "description": "Quantification of uncertainty around the reported score",
                "properties": {
                  "standard_error": {
                    "type": "object",
                    "description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
                    "properties": {
                      "value": {
                        "type": "number",
                        "description": "The standard error value"
                      },
                      "method": {
                        "type": "string",
                        "description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
                      }
                    },
                    "required": ["value"]
                  },
                  "confidence_interval": {
                    "type": "object",
                    "description": "Lower and upper bounds for the metric at a given confidence level.",
                    "properties": {
                      "lower": {
                        "type": "number",
                        "description": "Lower bound of the confidence interval"
                      },
                      "upper": {
                        "type": "number",
                        "description": "Upper bound of the confidence interval"
                      },
                      "confidence_level": {
                        "type": "number",
                        "description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
                        "minimum": 0,
                        "maximum": 1
                      },
                      "method": {
                        "type": "string",
                        "description": "How the confidence interval was computed"
                      }
                    },
                    "required": ["lower", "upper"]
                  },
                  "standard_deviation": {
                    "type": "number",
                    "description": "Standard deviation of the per-sample scores"
                  },
                  "num_samples": {
                    "type": "integer",
                    "description": "Number of samples used to compute the uncertainty estimates"
                  },
                  "num_bootstrap_samples": {
                    "type": "integer",
                    "description": "Number of bootstrap resamples used, if bootstrap method was applied"
                  }
                }
              }
            }
          },
          "generation_config": {
            "type": "object",
            "properties": {
              "generation_args": {
                "type": "object",
                "description": "Parameters used to generate results - properties may vary by model type",
                "properties": {
                  "temperature": {
                    "type": ["null", "number"],
                    "description": "Sampling temperature"
                  },
                  "top_p": {
                    "type": ["null", "number"],
                    "description": "Nucleus sampling parameter"
                  },
                  "top_k": {
                    "type": ["null", "number"],
                    "description": "Top-k sampling parameter"
                  },
                  "max_tokens": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "Maximum number of tokens to generate"
                  },
                  "execution_command": {
                    "type": "string",
                    "description": "Command used to run the model to generate results"
                  },
                  "reasoning": {
                    "type": "boolean",
                    "description": "Whether reasoning or chain-of-thought was used to generate results"
                  },
                  "prompt_template": {
                    "type": "string",
                    "description": "Input prompt template for task (should contain agentic info if needed)."
                  },
                  "agentic_eval_config": {
                    "type": "object",
                    "description": "General configuration for agentic evaluations.",
                    "properties": {
                      "available_tools": {
                        "type": "array",
                        "description": "List of all available tools with their configurations",
                        "items": {
                          "type": "object",
                          "properties": {
                            "name": {
                              "type": "string",
                              "description": "e.g. bash, calculator, ..."
                            },
                            "description": { "type": "string" },
                            "parameters": { "$ref": "#/$defs/additional_properties_object" }
                          }
                        }
                      },
                      "additional_details": { "$ref": "#/$defs/additional_properties_object" }
                    }
                  },
                  "eval_plan": {
                    "type": "object",
                    "description": "Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.",
                    "properties": {
                      "name": { "type": "string" },
                      "steps": {
                        "type": "array",
                        "description": "Array of evaluation plan steps",
                        "items": {
                          "solver": {
                            "type": "string",
                            "description": "Name of solver e.g. system_message, react."
                          },
                          "parameters": { "$ref": "#/$defs/additional_properties_object" }
                        }
                      },
                      "config": { "$ref": "#/$defs/additional_properties_object" }
                    }
                  },
                  "eval_limits": {
                    "type": "object",
                    "description": "Listed evaluation limits like time limit, message limit, token limit.",
                    "properties": {
                      "time_limit": {
                        "type": "integer",
                        "description": "Time limit for evaluation."
                      },
                      "message_limit": {
                        "type": "integer",
                        "description": "Message limit for evaluation."
                      },
                      "token_limit": {
                        "type": "integer",
                        "description": "Token limit for evaluation."
                      }
                    }
                  },
                  "sandbox": {
                    "type": "object",
                    "properties": {
                      "type": {
                        "type": "string",
                        "description": "Type of sandbox e.g. docker"
                      },
                      "config": {
                        "type": "string",
                        "description": "Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs"
                      }
                    }
                  },
                  "max_attempts": {
                    "type": "integer",
                    "description": "Maximum number of submission attempts (default 1).",
                    "default": 1
                  },
                  "incorrect_attempt_feedback": {
                    "type": "string",
                    "description": "Feedback from the model after incorrect attempt."
                  }
                },
                "additionalProperties": true
              },
              "additional_details": { "$ref": "#/$defs/additional_properties_object" }
            }
          }
        }
      }
    },
    "detailed_evaluation_results": {
      "description": "Reference to the evaluation results for all individual samples in the evaluation",
      "properties": {
        "format": {
          "type": "string",
          "description": "Format of the detailed evaluation results",
          "enum": ["jsonl", "json"]
        },
        "file_path": {
          "type": "string",
          "description": "Path to the detailed evaluation results file"
        },
        "hash_algorithm": {
          "type": "string",
          "description": "Hash algorithm used for checksum and sample_hash in instance-level data",
          "enum": ["sha256", "md5"]
        },
        "checksum": {
          "type": "string",
          "description": "Checksum value of the file"
        },
        "total_rows": {
          "type": "integer",
          "description": "Total number of rows in the detailed evaluation results file"
        }
      }
    }
  },
  "$defs": {
    "additional_properties_object": {
      "type": "object",
      "description": "Additional parameters (key-value object)",
      "additionalProperties": true
    },
    "judge_config": {
      "type": "object",
      "description": "Configuration for a single LLM judge/juror",
      "required": ["model_info"],
      "properties": {
        "model_info": { "$ref": "#/$defs/model_info" },
        "temperature": { "type": "number" },
        "weight": {
          "type": "number",
          "description": "Weight of this judge's score in aggregation (used in jury)"
        }
      }
    },
    "model_info": {
      "type": "object",
      "description": "Complete model specification including basic information, technical configuration and inference settings",
      "required": ["name", "id"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Model name provided by evaluation source"
        },
        "id": {
          "type": "string",
          "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
        },
        "developer": {
          "type": "string",
          "description": "Name of organization that provides the model (e.g. 'OpenAI')"
        },
        "inference_platform": {
          "type": "string",
          "description": "Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)"
        },
        "inference_engine": {
          "type": "object",
          "description": "Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).",
          "properties": {
            "name": {
              "type": "string",
              "description": "Name of the inference engine"
            },
            "version": {
              "type": "string",
              "description": "Version of the inference engine"
            }
          }
        },
        "additional_details": { "$ref": "#/$defs/additional_properties_object" }
      }
    },
    "source_data_url": {
      "type": "object",
      "description": "URL source for the evaluation data",
      "required": ["dataset_name", "source_type", "url"],
      "additionalProperties": true,
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": { "const": "url" },
        "url": {
          "type": "array",
          "items": { "type": "string" },
          "minItems": 1,
          "description": "URL(s) for the source of the evaluation data"
        },
        "additional_details": { "$ref": "#/$defs/additional_properties_object" }
      }
    },
    "source_data_hf": {
      "type": "object",
      "description": "Details about HuggingFace dataset used for evaluation",
      "required": ["dataset_name", "source_type"],
      "additionalProperties": true,
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": { "const": "hf_dataset" },
        "hf_repo": {
          "type": "string",
          "description": "HuggingFace repository identifier"
        },
        "hf_split": {
          "type": "string",
          "description": "One of train, val or test."
        },
        "samples_number": {
          "type": "integer",
          "description": "Number of samples in the dataset"
        },
        "sample_ids": {
          "type": "array",
          "description": "Array of sample ids used for evaluation",
          "items": { "type": ["integer", "string"] }
        },
        "additional_details": { "$ref": "#/$defs/additional_properties_object" }
      }
    },
    "source_data_private": {
      "type": "object",
      "description": "Generic source data when neither URL array nor HuggingFace dataset applies",
      "required": ["dataset_name", "source_type"],
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": { "const": "other" },
        "additional_details": { "$ref": "#/$defs/additional_properties_object" }
      }
    }
  }
}
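For a quick offline sanity check of a record against this schema, the third-party jsonschema package works; the Space itself validates with Pydantic models generated from the schema (see eval_types.py below), so this is only an illustration, and the record is a made-up minimal example:

    import json

    from jsonschema import Draft7Validator  # third-party; not a dependency of this Space

    with open("eval.schema.json") as f:
        schema = json.load(f)

    record = {
        "schema_version": "0.2.0",
        "evaluation_id": "gsm8k/meta-llama/Llama-3.1-8B-Instruct/1718000000",
        "retrieved_timestamp": "1718000000",
        "source_metadata": {
            "source_type": "evaluation_run",
            "source_organization_name": "ExampleOrg",
            "evaluator_relationship": "third_party",
        },
        "model_info": {
            "name": "Llama-3.1-8B-Instruct",
            "id": "meta-llama/Llama-3.1-8B-Instruct",
        },
        "evaluation_results": [
            {
                "evaluation_name": "gsm8k",
                "source_data": {"dataset_name": "gsm8k", "source_type": "hf_dataset"},
                "metric_config": {"lower_is_better": False, "score_type": "binary"},
                "score_details": {"score": 0.84},
            }
        ],
    }

    # Prints nothing when the record is valid.
    for error in Draft7Validator(schema).iter_errors(record):
        print(error.message)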
eval_types.py
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# generated by datamodel-codegen:
|
| 2 |
+
# filename: eval.schema.json
|
| 3 |
+
# timestamp: 2026-02-11T15:00:05+00:00
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
from enum import Enum
|
| 7 |
+
from pydantic import BaseModel, ConfigDict, Field, confloat, conint
|
| 8 |
+
from typing import Any, Literal
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class SourceType(Enum):
|
| 12 |
+
documentation = "documentation"
|
| 13 |
+
evaluation_run = "evaluation_run"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class EvaluatorRelationship(Enum):
|
| 17 |
+
first_party = "first_party"
|
| 18 |
+
third_party = "third_party"
|
| 19 |
+
collaborative = "collaborative"
|
| 20 |
+
other = "other"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class SourceMetadata(BaseModel):
|
| 24 |
+
source_name: str | None = Field(
|
| 25 |
+
None,
|
| 26 |
+
description="Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation).",
|
| 27 |
+
)
|
| 28 |
+
source_type: SourceType = Field(
|
| 29 |
+
...,
|
| 30 |
+
description="Whether the data comes from a direct evaluation run or from documentation",
|
| 31 |
+
)
|
| 32 |
+
source_organization_name: str = Field(
|
| 33 |
+
..., description="Name of the organization that provides the data"
|
| 34 |
+
)
|
| 35 |
+
source_organization_url: str | None = Field(
|
| 36 |
+
None, description="URL for the organization that provides the data"
|
| 37 |
+
)
|
| 38 |
+
source_organization_logo_url: str | None = Field(
|
| 39 |
+
None, description="URL for the Logo for the organization that provides the data"
|
| 40 |
+
)
|
| 41 |
+
evaluator_relationship: EvaluatorRelationship = Field(
|
| 42 |
+
..., description="Relationship between the evaluator and the model"
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class ScoreType(Enum):
|
| 47 |
+
binary = "binary"
|
| 48 |
+
continuous = "continuous"
|
| 49 |
+
levels = "levels"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class AggregationMethod(Enum):
|
| 53 |
+
majority_vote = "majority_vote"
|
| 54 |
+
average = "average"
|
| 55 |
+
weighted_average = "weighted_average"
|
| 56 |
+
median = "median"
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class StandardError(BaseModel):
|
| 60 |
+
value: float = Field(..., description="The standard error value")
|
| 61 |
+
method: str | None = Field(
|
| 62 |
+
None,
|
| 63 |
+
description="How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')",
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class ConfidenceInterval(BaseModel):
|
| 68 |
+
lower: float = Field(..., description="Lower bound of the confidence interval")
|
| 69 |
+
upper: float = Field(..., description="Upper bound of the confidence interval")
|
| 70 |
+
confidence_level: confloat(ge=0.0, le=1.0) | None = Field(
|
| 71 |
+
None, description="Confidence level (e.g. 0.95 for a 95% confidence interval)"
|
| 72 |
+
)
|
| 73 |
+
method: str | None = Field(
|
| 74 |
+
None, description="How the confidence interval was computed"
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class Uncertainty(BaseModel):
|
| 79 |
+
standard_error: StandardError | None = Field(
|
| 80 |
+
None,
|
| 81 |
+
description="Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
|
| 82 |
+
)
|
| 83 |
+
confidence_interval: ConfidenceInterval | None = Field(
|
| 84 |
+
None,
|
| 85 |
+
description="Lower and upper bounds for the metric at a given confidence level.",
|
| 86 |
+
)
|
| 87 |
+
standard_deviation: float | None = Field(
|
| 88 |
+
None, description="Standard deviation of the per-sample scores"
|
| 89 |
+
)
|
| 90 |
+
num_samples: int | None = Field(
|
| 91 |
+
None, description="Number of samples used to compute the uncertainty estimates"
|
| 92 |
+
)
|
| 93 |
+
num_bootstrap_samples: int | None = Field(
|
| 94 |
+
None,
|
| 95 |
+
description="Number of bootstrap resamples used, if bootstrap method was applied",
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class EvalLimits(BaseModel):
|
| 100 |
+
time_limit: int | None = Field(None, description="Time limit for evaluation.")
|
| 101 |
+
message_limit: int | None = Field(None, description="Message limit for evaluation.")
|
| 102 |
+
token_limit: int | None = Field(None, description="Token limit for evaluation.")
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class Sandbox(BaseModel):
|
| 106 |
+
type: str | None = Field(None, description="Type of sandbox e.g. docker")
|
| 107 |
+
config: str | None = Field(
|
| 108 |
+
None,
|
| 109 |
+
description="Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs",
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class Format(Enum):
|
| 114 |
+
jsonl = "jsonl"
|
| 115 |
+
json = "json"
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class HashAlgorithm(Enum):
|
| 119 |
+
sha256 = "sha256"
|
| 120 |
+
md5 = "md5"
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class DetailedEvaluationResults(BaseModel):
|
| 124 |
+
format: Format | None = Field(
|
| 125 |
+
None, description="Format of the detailed evaluation results"
|
| 126 |
+
)
|
| 127 |
+
file_path: str | None = Field(
|
| 128 |
+
None, description="Path to the detailed evaluation results file"
|
| 129 |
+
)
|
| 130 |
+
hash_algorithm: HashAlgorithm | None = Field(
|
| 131 |
+
None,
|
| 132 |
+
description="Hash algorithm used for checksum and sample_hash in instance-level data",
|
| 133 |
+
)
|
| 134 |
+
checksum: str | None = Field(None, description="Checksum value of the file")
|
| 135 |
+
total_rows: int | None = Field(
|
| 136 |
+
None, description="Total number of rows in the detailed evaluation results file"
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class AdditionalPropertiesObject(BaseModel):
|
| 141 |
+
model_config = ConfigDict(
|
| 142 |
+
extra="allow",
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class InferenceEngine(BaseModel):
|
| 147 |
+
name: str | None = Field(None, description="Name of the inference engine")
|
| 148 |
+
version: str | None = Field(None, description="Version of the inference engine")
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
class ModelInfo(BaseModel):
|
| 152 |
+
name: str = Field(..., description="Model name provided by evaluation source")
|
| 153 |
+
id: str = Field(
|
| 154 |
+
...,
|
| 155 |
+
description="Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)",
|
| 156 |
+
)
|
| 157 |
+
developer: str | None = Field(
|
| 158 |
+
None, description="Name of organization that provides the model (e.g. 'OpenAI')"
|
| 159 |
+
)
|
| 160 |
+
inference_platform: str | None = Field(
|
| 161 |
+
None,
|
| 162 |
+
description="Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)",
|
| 163 |
+
)
|
| 164 |
+
inference_engine: InferenceEngine | None = Field(
|
| 165 |
+
None,
|
| 166 |
+
description="Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).",
|
| 167 |
+
)
|
| 168 |
+
additional_details: AdditionalPropertiesObject | None = None
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
class SourceDataUrl(BaseModel):
|
| 172 |
+
model_config = ConfigDict(
|
| 173 |
+
extra="allow",
|
| 174 |
+
)
|
| 175 |
+
dataset_name: str = Field(..., description="Name of the source dataset")
|
| 176 |
+
source_type: Literal["url"]
|
| 177 |
+
url: list[str] = Field(
|
| 178 |
+
..., description="URL(s) for the source of the evaluation data", min_length=1
|
| 179 |
+
)
|
| 180 |
+
additional_details: AdditionalPropertiesObject | None = None
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
class SourceDataHf(BaseModel):
|
| 184 |
+
model_config = ConfigDict(
|
| 185 |
+
extra="allow",
|
| 186 |
+
)
|
| 187 |
+
dataset_name: str = Field(..., description="Name of the source dataset")
|
| 188 |
+
source_type: Literal["hf_dataset"]
|
| 189 |
+
hf_repo: str | None = Field(None, description="HuggingFace repository identifier")
|
| 190 |
+
hf_split: str | None = Field(None, description="One of train, val or test.")
|
| 191 |
+
samples_number: int | None = Field(
|
| 192 |
+
None, description="Number of samples in the dataset"
|
| 193 |
+
)
|
| 194 |
+
sample_ids: list[int | str] | None = Field(
|
| 195 |
+
None, description="Array of sample ids used for evaluation"
|
| 196 |
+
)
|
| 197 |
+
additional_details: AdditionalPropertiesObject | None = None
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
class SourceDataPrivate(BaseModel):
|
| 201 |
+
dataset_name: str = Field(..., description="Name of the source dataset")
|
| 202 |
+
source_type: Literal["other"]
|
| 203 |
+
additional_details: AdditionalPropertiesObject | None = None
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
class ScoreDetails(BaseModel):
|
| 207 |
+
score: float = Field(..., description="The score for the evaluation")
|
| 208 |
+
details: AdditionalPropertiesObject | None = None
|
| 209 |
+
uncertainty: Uncertainty | None = Field(
|
| 210 |
+
None, description="Quantification of uncertainty around the reported score"
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
class AvailableTool(BaseModel):
|
| 215 |
+
name: str | None = Field(None, description="e.g. bash, calculator, ...")
|
| 216 |
+
description: str | None = None
|
| 217 |
+
parameters: AdditionalPropertiesObject | None = None
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
class AgenticEvalConfig(BaseModel):
|
| 221 |
+
available_tools: list[AvailableTool] | None = Field(
|
| 222 |
+
None, description="List of all available tools with their configurations"
|
| 223 |
+
)
|
| 224 |
+
additional_details: AdditionalPropertiesObject | None = None
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class EvalPlan(BaseModel):
|
| 228 |
+
name: str | None = None
|
| 229 |
+
steps: list[Any] | None = Field(None, description="Array of evaluation plan steps")
|
| 230 |
+
config: AdditionalPropertiesObject | None = None
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
class GenerationArgs(BaseModel):
|
| 234 |
+
model_config = ConfigDict(
|
| 235 |
+
extra="allow",
|
| 236 |
+
)
|
| 237 |
+
temperature: float | None = Field(None, description="Sampling temperature")
|
| 238 |
+
top_p: float | None = Field(None, description="Nucleus sampling parameter")
|
| 239 |
+
top_k: float | None = Field(None, description="Top-k sampling parameter")
|
| 240 |
+
max_tokens: conint(ge=1) | None = Field(
|
| 241 |
+
None, description="Maximum number of tokens to generate"
|
| 242 |
+
)
|
| 243 |
+
execution_command: str | None = Field(
|
| 244 |
+
None, description="Command used to run the model to generate results"
|
| 245 |
+
)
|
| 246 |
+
reasoning: bool | None = Field(
|
| 247 |
+
None,
|
| 248 |
+
description="Whether reasoning orchain-of-thought was used to generate results",
|
| 249 |
+
)
|
| 250 |
+
prompt_template: str | None = Field(
|
| 251 |
+
None,
|
| 252 |
+
description="Input prompt template for task (should contain agentic info if needed).",
|
| 253 |
+
)
|
| 254 |
+
agentic_eval_config: AgenticEvalConfig | None = Field(
|
| 255 |
+
None, description="General configuration for agentic evaluations."
|
| 256 |
+
)
|
| 257 |
+
eval_plan: EvalPlan | None = Field(
|
| 258 |
+
None,
|
| 259 |
+
description="Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.",
|
| 260 |
+
)
|
| 261 |
+
eval_limits: EvalLimits | None = Field(
|
| 262 |
+
None,
|
| 263 |
+
description="Listed evaluation limits like time limit, message limit, token limit.",
|
| 264 |
+
)
|
| 265 |
+
sandbox: Sandbox | None = None
|
| 266 |
+
max_attempts: int | None = Field(
|
| 267 |
+
1, description="Maximum number of submission attempts (default 1)."
|
| 268 |
+
)
|
| 269 |
+
incorrect_attempt_feedback: str | None = Field(
|
| 270 |
+
None, description="Feedback from the model after incorrect attempt."
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
class GenerationConfig(BaseModel):
|
| 275 |
+
generation_args: GenerationArgs | None = Field(
|
| 276 |
+
None,
|
| 277 |
+
description="Parameters used to generate results - properties may vary by model type",
|
| 278 |
+
)
|
| 279 |
+
additional_details: AdditionalPropertiesObject | None = None
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
class JudgeConfig(BaseModel):
|
| 283 |
+
model_info: ModelInfo
|
| 284 |
+
temperature: float | None = None
|
| 285 |
+
weight: float | None = Field(
|
| 286 |
+
None, description="Weight of this judge's score in aggregation (used in jury)"
|
| 287 |
+
)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
class LlmScoring(BaseModel):
|
| 291 |
+
model_config = ConfigDict(
|
| 292 |
+
extra="allow",
|
| 293 |
+
)
|
| 294 |
+
judges: list[JudgeConfig] = Field(
|
| 295 |
+
...,
|
| 296 |
+
description="LLM judge(s) - single item for judge, multiple for jury",
|
| 297 |
+
min_length=1,
|
| 298 |
+
)
|
| 299 |
+
input_prompt: str = Field(..., description="Prompt template used for judging")
|
| 300 |
+
aggregation_method: AggregationMethod | None = Field(
|
| 301 |
+
None, description="How to aggregate scores when multiple judges"
|
| 302 |
+
)
|
| 303 |
+
expert_baseline: float | None = Field(
|
| 304 |
+
None, description="Expert/human baseline score for comparison"
|
| 305 |
+
)
|
| 306 |
+
additional_details: AdditionalPropertiesObject | None = None
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
class MetricConfig(BaseModel):
|
| 310 |
+
evaluation_description: str | None = Field(
|
| 311 |
+
None, description="Description of the evaluation"
|
| 312 |
+
)
|
| 313 |
+
lower_is_better: bool = Field(..., description="Whether a lower score is better")
|
| 314 |
+
score_type: ScoreType | None = Field(None, description="Type of score")
|
| 315 |
+
level_names: list[str] | None = Field(None, description="Names of the score levels")
|
| 316 |
+
level_metadata: list[str] | None = Field(
|
| 317 |
+
None, description="Additional Description for each Score Level"
|
| 318 |
+
)
|
| 319 |
+
has_unknown_level: bool | None = Field(
|
| 320 |
+
None,
|
| 321 |
+
description="Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown",
|
| 322 |
+
)
|
| 323 |
+
min_score: float | None = Field(
|
| 324 |
+
None, description="Minimum possible score for continuous metric"
|
| 325 |
+
)
|
| 326 |
+
max_score: float | None = Field(
|
| 327 |
+
None, description="Maximum possible score for continuous metric"
|
| 328 |
+
)
|
| 329 |
+
llm_scoring: LlmScoring | None = Field(
|
| 330 |
+
None, description="Configuration when LLM is used as scorer/judge"
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
class EvaluationResult(BaseModel):
|
| 335 |
+
evaluation_name: str = Field(..., description="Name of the evaluation")
|
| 336 |
+
source_data: SourceDataUrl | SourceDataHf | SourceDataPrivate = Field(
|
| 337 |
+
...,
|
| 338 |
+
description="Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
|
| 339 |
+
)
|
| 340 |
+
evaluation_timestamp: str | None = Field(
|
| 341 |
+
None, description="Timestamp for when the evaluations were run"
|
| 342 |
+
)
|
| 343 |
+
metric_config: MetricConfig = Field(..., description="Details about the metric")
|
| 344 |
+
score_details: ScoreDetails = Field(
|
| 345 |
+
..., description="The score for the evaluation and related details"
|
| 346 |
+
)
|
| 347 |
+
generation_config: GenerationConfig | None = None
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
class EvaluationLog(BaseModel):
|
| 351 |
+
model_config = ConfigDict(
|
| 352 |
+
extra="forbid",
|
| 353 |
+
)
|
| 354 |
+
schema_version: str = Field(
|
| 355 |
+
..., description="Version of the schema used for this evaluation data"
|
| 356 |
+
)
|
| 357 |
+
evaluation_id: str = Field(
|
| 358 |
+
...,
|
| 359 |
+
description="Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format",
|
| 360 |
+
)
|
| 361 |
+
evaluation_timestamp: str | None = Field(
|
| 362 |
+
None, description="Timestamp for when the evaluation was run"
|
| 363 |
+
)
|
| 364 |
+
retrieved_timestamp: str = Field(
|
| 365 |
+
...,
|
| 366 |
+
description="Timestamp for when this record was created - using Unix Epoch time format",
|
| 367 |
+
)
|
| 368 |
+
source_metadata: SourceMetadata = Field(
|
| 369 |
+
..., description="Metadata about the source of the leaderboard data"
|
| 370 |
+
)
|
| 371 |
+
model_info: ModelInfo
|
| 372 |
+
evaluation_results: list[EvaluationResult] = Field(
|
| 373 |
+
..., description="Array of evaluation results"
|
| 374 |
+
)
|
| 375 |
+
detailed_evaluation_results: DetailedEvaluationResults | None = Field(
|
| 376 |
+
None,
|
| 377 |
+
description="Reference to the evaluation results for all individual samples in the evaluation",
|
| 378 |
+
)
|
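For orientation, a minimal usage sketch (not part of the commit; the file path is illustrative, following the planned data/{eval_name}/{developer_name}/{model_name}/{uuid}.json layout): validation happens on construction, exactly as validate_data.py does further below.

import json

from eval_types import EvaluationLog

# Illustrative path into the datastore checkout.
with open("data/gsm8k/meta-llama/Llama-3.1-8B-Instruct/0f1e2d3c.json") as f:
    log = EvaluationLog(**json.load(f))  # raises pydantic.ValidationError on bad data

print(log.evaluation_id, len(log.evaluation_results))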
instance_level_eval.schema.json
ADDED
@@ -0,0 +1,329 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "instance_level_eval_0.2.0",
  "type": "object",
  "description": "Schema for storing instance-level evaluation data for LLM evaluations, supporting single-turn, multi-turn, and agentic interactions",
  "required": [
    "schema_version",
    "evaluation_id",
    "model_id",
    "evaluation_name",
    "sample_id",
    "interaction_type",
    "input",
    "answer_attribution",
    "evaluation"
  ],
  "additionalProperties": true,
  "properties": {
    "schema_version": {
      "type": "string",
      "description": "Version of the schema used for this instance data"
    },
    "evaluation_id": {
      "type": "string",
      "description": "Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file."
    },
    "model_id": {
      "type": "string",
      "description": "Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)"
    },
    "evaluation_name": {
      "type": "string",
      "description": "The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)"
    },
    "sample_id": {
      "type": ["integer", "string"],
      "description": "Question/sample identifier from the original dataset (e.g. gsm8k_0001)"
    },
    "sample_hash": {
      "type": "string",
      "description": "Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent"
    },
    "interaction_type": {
      "type": "string",
      "enum": ["single_turn", "multi_turn", "agentic"],
      "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
    },
    "input": {
      "type": "object",
      "description": "Input data for the evaluation sample",
      "required": ["raw", "reference"],
      "properties": {
        "raw": {
          "type": "string",
          "description": "The raw input as defined in the eval"
        },
        "formatted": {
          "type": "string",
          "description": "Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees"
        },
        "reference": {
          "type": "string",
          "description": "Ground truth or reference answer for comparison/scoring"
        },
        "choices": {
          "type": "array",
          "description": "Optional list of choices for multiple-choice questions",
          "items": {
            "type": "string"
          }
        }
      }
    },
    "output": {
      "type": ["object", "null"],
      "description": "Output data - only used for single_turn interactions, null for multi_turn/agentic",
      "required": ["raw"],
      "properties": {
        "raw": {
          "type": "string",
          "description": "Complete model response"
        },
        "reasoning_trace": {
          "type": ["string", "null"],
          "description": "Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)"
        }
      }
    },
    "interactions": {
      "type": ["array", "null"],
      "description": "List of interactions - used for multi_turn and agentic, null for single_turn",
      "items": {
        "type": "object",
        "required": ["turn_idx", "role"],
        "properties": {
          "turn_idx": {
            "type": "integer",
            "minimum": 0,
            "description": "Index starting from 0 indicating the position in the conversation"
          },
          "role": {
            "type": "string",
            "description": "Role of the speaker (e.g. user, assistant, system, tool)"
          },
          "content": {
            "type": ["string", "null"],
            "description": "The actual raw text for that particular turn (can be null if empty)"
          },
          "reasoning_trace": {
            "type": ["string", "null"],
            "description": "Reasoning trace for that particular turn if applicable"
          },
          "tool_calls": {
            "type": ["array", "null"],
            "description": "List of tool invocations for this turn, if applicable",
            "items": {
              "type": "object",
              "required": ["id", "name"],
              "properties": {
                "id": {
                  "type": "string",
                  "description": "Unique identifier for the tool call"
                },
                "name": {
                  "type": "string",
                  "description": "Name of tool/function"
                },
                "arguments": {
                  "type": "object",
                  "description": "Arguments used to call the tool",
                  "additionalProperties": true
                }
              }
            }
          },
          "tool_call_id": {
            "oneOf": [
              {
                "type": "string",
                "description": "Reference to the tool call ID this turn is responding to (for tool role responses)"
              },
              {
                "type": "array",
                "description": "Reference to the tool call ID(s) this message has the content payload for.",
                "items": {
                  "type": "string"
                }
              }
            ]
          }
        }
      }
    },
    "answer_attribution": {
      "type": "array",
      "description": "Information about how the answer was extracted from the model output",
      "items": {
        "type": "object",
        "required": ["turn_idx", "source", "extracted_value", "extraction_method", "is_terminal"],
        "properties": {
          "turn_idx": {
            "type": "integer",
            "minimum": 0,
            "description": "Turn index in interactions. 0 for single_turn"
          },
          "source": {
            "type": "string",
            "description": "Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')"
          },
          "extracted_value": {
            "type": "string",
            "description": "Value that was extracted"
          },
          "extraction_method": {
            "type": "string",
            "description": "Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)"
          },
          "is_terminal": {
            "type": "boolean",
            "description": "Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)"
          }
        }
      }
    },
    "evaluation": {
      "type": "object",
      "description": "Evaluation results and scoring data",
      "required": ["score", "is_correct"],
      "properties": {
        "score": {
          "type": ["number", "boolean"],
          "description": "Instance-level score"
        },
        "is_correct": {
          "type": "boolean",
          "description": "Whether the final answer is correct"
        },
        "num_turns": {
          "type": "integer",
          "minimum": 1,
          "description": "Number of turns in the interaction"
        },
        "tool_calls_count": {
          "type": "integer",
          "minimum": 0,
          "description": "Count of tool calls across all turns in interactions"
        }
      }
    },
    "token_usage": {
      "type": ["object", "null"],
      "description": "Token usage for the model completion",
      "required": ["input_tokens", "output_tokens", "total_tokens"],
      "properties": {
        "input_tokens": {
          "type": "integer",
          "minimum": 0,
          "description": "Total input tokens used"
        },
        "output_tokens": {
          "type": "integer",
          "minimum": 0,
          "description": "Total output tokens used"
        },
        "total_tokens": {
          "type": "integer",
          "minimum": 0,
          "description": "Total tokens used"
        },
        "input_tokens_cache_write": {
          "type": ["integer", "null"],
          "minimum": 0,
          "description": "Number of tokens written to the cache"
        },
        "input_tokens_cache_read": {
          "type": ["integer", "null"],
          "minimum": 0,
          "description": "Number of tokens retrieved from the cache"
        },
        "reasoning_tokens": {
          "type": ["integer", "null"],
          "minimum": 0,
          "description": "Number of tokens used for reasoning"
        }
      }
    },
    "performance": {
      "type": ["object", "null"],
      "description": "Performance and latency metrics",
      "properties": {
        "latency_ms": {
          "type": ["number", "null"],
          "minimum": 0,
          "description": "Total latency in milliseconds"
        },
        "time_to_first_token_ms": {
          "type": ["number", "null"],
          "minimum": 0,
          "description": "Time to first token in milliseconds"
        },
        "generation_time_ms": {
          "type": ["number", "null"],
          "minimum": 0,
          "description": "Time for generation in milliseconds"
        }
      },
      "additionalProperties": true
    },
    "error": {
      "type": ["string", "null"],
      "description": "Information about any error that occurred (e.g. timeout, refusal, API error)"
    },
    "metadata": {
      "type": "object",
      "description": "Optional metadata about the sample (e.g. subject, difficulty, tags)",
      "additionalProperties": true
    }
  },
  "allOf": [
    {
      "if": {
        "properties": {
          "interaction_type": {
            "const": "single_turn"
          }
        }
      },
      "then": {
        "required": ["output"],
        "properties": {
          "output": {
            "type": "object",
            "not": {
              "type": "null"
            }
          },
          "interactions": {
            "type": "null"
          }
        }
      }
    },
    {
      "if": {
        "properties": {
          "interaction_type": {
            "enum": ["multi_turn", "agentic"]
          }
        }
      },
      "then": {
        "required": ["interactions"],
        "properties": {
          "output": {
            "type": "null"
          },
          "interactions": {
            "type": "array",
            "not": {
              "type": "null"
            }
          },
          "evaluation": {
            "required": ["num_turns"]
          }
        }
      }
    }
  ]
}
instance_level_types.py
ADDED
@@ -0,0 +1,188 @@
# generated by datamodel-codegen:
# filename: instance_level_eval.schema.json
# timestamp: 2026-02-11T15:00:06+00:00

from __future__ import annotations
from enum import Enum
from pydantic import BaseModel, ConfigDict, Field, confloat, conint
from typing import Any


class InteractionType(Enum):
    single_turn = "single_turn"
    multi_turn = "multi_turn"
    agentic = "agentic"


class Input(BaseModel):
    raw: str = Field(..., description="The raw input as defined in the eval")
    formatted: str | None = Field(
        None,
        description="Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees",
    )
    reference: str = Field(
        ..., description="Ground truth or reference answer for comparison/scoring"
    )
    choices: list[str] | None = Field(
        None, description="Optional list of choices for multiple-choice questions"
    )


class Output(BaseModel):
    raw: str = Field(..., description="Complete model response")
    reasoning_trace: str | None = Field(
        None,
        description="Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)",
    )


class ToolCall(BaseModel):
    id: str = Field(..., description="Unique identifier for the tool call")
    name: str = Field(..., description="Name of tool/function")
    arguments: dict[str, Any] | None = Field(
        None, description="Arguments used to call the tool"
    )


class Interaction(BaseModel):
    turn_idx: conint(ge=0) = Field(
        ...,
        description="Index starting from 0 indicating the position in the conversation",
    )
    role: str = Field(
        ..., description="Role of the speaker (e.g. user, assistant, system, tool)"
    )
    content: str | None = Field(
        None,
        description="The actual raw text for that particular turn (can be null if empty)",
    )
    reasoning_trace: str | None = Field(
        None, description="Reasoning trace for that particular turn if applicable"
    )
    tool_calls: list[ToolCall] | None = Field(
        None, description="List of tool invocations for this turn, if applicable"
    )
    tool_call_id: str | list[str] | None = None


class AnswerAttributionItem(BaseModel):
    turn_idx: conint(ge=0) = Field(
        ..., description="Turn index in interactions. 0 for single_turn"
    )
    source: str = Field(
        ...,
        description="Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')",
    )
    extracted_value: str = Field(..., description="Value that was extracted")
    extraction_method: str = Field(
        ...,
        description="Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)",
    )
    is_terminal: bool = Field(
        ...,
        description="Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)",
    )


class Evaluation(BaseModel):
    score: float | bool = Field(..., description="Instance-level score")
    is_correct: bool = Field(..., description="Whether the final answer is correct")
    num_turns: conint(ge=1) | None = Field(
        None, description="Number of turns in the interaction"
    )
    tool_calls_count: conint(ge=0) | None = Field(
        None, description="Count of tool calls across all turns in interactions"
    )


class TokenUsage(BaseModel):
    input_tokens: conint(ge=0) = Field(..., description="Total input tokens used")
    output_tokens: conint(ge=0) = Field(..., description="Total output tokens used")
    total_tokens: conint(ge=0) = Field(..., description="Total tokens used")
    input_tokens_cache_write: conint(ge=0) | None = Field(
        None, description="Number of tokens written to the cache"
    )
    input_tokens_cache_read: conint(ge=0) | None = Field(
        None, description="Number of tokens retrieved from the cache"
    )
    reasoning_tokens: conint(ge=0) | None = Field(
        None, description="Number of tokens used for reasoning"
    )


class Performance(BaseModel):
    model_config = ConfigDict(
        extra="allow",
    )
    latency_ms: confloat(ge=0.0) | None = Field(
        None, description="Total latency in milliseconds"
    )
    time_to_first_token_ms: confloat(ge=0.0) | None = Field(
        None, description="Time to first token in milliseconds"
    )
    generation_time_ms: confloat(ge=0.0) | None = Field(
        None, description="Time for generation in milliseconds"
    )


class InstanceLevelEvaluationLog(BaseModel):
    model_config = ConfigDict(
        extra="allow",
    )
    schema_version: str = Field(
        ..., description="Version of the schema used for this instance data"
    )
    evaluation_id: str = Field(
        ...,
        description="Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file.",
    )
    model_id: str = Field(
        ...,
        description="Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)",
    )
    evaluation_name: str = Field(
        ...,
        description="The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)",
    )
    sample_id: int | str = Field(
        ...,
        description="Question/sample identifier from the original dataset (e.g. gsm8k_0001)",
    )
    sample_hash: str | None = Field(
        None,
        description="Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent",
    )
    interaction_type: InteractionType = Field(
        ...,
        description="Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents",
    )
    input: Input = Field(..., description="Input data for the evaluation sample")
    output: Output | None = Field(
        None,
        description="Output data - only used for single_turn interactions, null for multi_turn/agentic",
    )
    interactions: list[Interaction] | None = Field(
        None,
        description="List of interactions - used for multi_turn and agentic, null for single_turn",
    )
    answer_attribution: list[AnswerAttributionItem] = Field(
        ...,
        description="Information about how the answer was extracted from the model output",
    )
    evaluation: Evaluation = Field(
        ..., description="Evaluation results and scoring data"
    )
    token_usage: TokenUsage | None = Field(
        None, description="Token usage for the model completion"
    )
    performance: Performance | None = Field(
        None, description="Performance and latency metrics"
    )
    error: str | None = Field(
        None,
        description="Information about any error that occurred (e.g. timeout, refusal, API error)",
    )
    metadata: dict[str, Any] | None = Field(
        None,
        description="Optional metadata about the sample (e.g. subject, difficulty, tags)",
    )
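A companion sketch (again not part of the commit, all values purely illustrative): building one minimal single_turn record with these models and dumping it as a JSONL line, the shape the {uuid}_samples.jsonl files are expected to hold.

from instance_level_types import (
    AnswerAttributionItem,
    Evaluation,
    Input,
    InstanceLevelEvaluationLog,
    InteractionType,
    Output,
)

record = InstanceLevelEvaluationLog(
    schema_version="instance_level_eval_0.2.0",
    evaluation_id="gsm8k/meta-llama/Llama-3.2-1B-Instruct/1739283600",
    model_id="meta-llama/Llama-3.2-1B-Instruct",
    evaluation_name="GSM8K",
    sample_id="gsm8k_0001",
    interaction_type=InteractionType.single_turn,
    input=Input(raw="What is 2 + 2?", reference="4"),
    output=Output(raw="The answer is 4."),
    answer_attribution=[
        AnswerAttributionItem(
            turn_idx=0,
            source="output.raw",
            extracted_value="4",
            extraction_method="regex",
            is_terminal=True,
        )
    ],
    evaluation=Evaluation(score=1.0, is_correct=True),
)

# One record per line in the samples JSONL file.
print(record.model_dump_json(exclude_none=True))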
pyproject.toml
ADDED
@@ -0,0 +1,18 @@
[project]
name = "eee-validator"
version = "0.1.0"
description = "CI validation proxy for the EEE datastore on HuggingFace"
requires-python = ">=3.12"
dependencies = [
    "huggingface-hub>=0.27.0",
    "pydantic>=2.0",
    "gradio>=5.0",
    "jsonschema>=4.0",
]

[tool.hatch.build.targets.wheel]
packages = ["."]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
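Since uv.lock is committed alongside this pyproject.toml (see below), one way to reproduce the Space environment from these two files is a single locked install; the command assumes the repo root as working directory:

uv sync --frozen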
requirement_plan.txt
ADDED
@@ -0,0 +1,44 @@
Our problem is as follows - we need to move our individual JSONs + JSONLs to HuggingFace, as it's just a better structure and we might hit storage limits etc. New data would be submitted by drag-and-drop through the file-upload interface of HuggingFace datasets, which is for all purposes just git.

Aspirationally, the validation + de-duplication workflow is able to:
(1) Detect changes (i.e. only the data added during the PR)
(2) Run de-duplication w.r.t. the PR and the existing datastore
(3) Run the validation only for the added data (one approach here would use git diff), and
(4) Add the data back to the datastore

The HuggingFace dataset -> https://huggingface.co/datasets/evaleval/EEE_datastore (evaleval/EEE_datastore)

Both this repo and the dataset can be managed by git.

- repo structure
data/
├── {eval_name}/
│   └── {developer_name}/
│       └── {model_name}/
│           └── {uuid}.json
└── {uuid}_samples.jsonl
validate_data.py
eval_types.py
instance_level_types.py

There are typically two file types that need to be validated: aggregate information as JSON, and samples as instance-level JSONL ({uuid}_samples.jsonl - anything with jsonl in the name).

The data will be added by users via the upload functionality, which will open a PR.

First request: build a Dockerfile with uv in which the validation will be run, and get all dependencies into a requirements.txt that can be uv add -r'd or something.

Regarding the workflow, implement the following:

(1) Detect changes and pull them to the space
- When a user/external collaborator opens a PR in the HF dataset, we wake the space or trigger it via webhook (or a better procedure). Following this, we use git diff (or something better, if you can recommend it) on the HuggingFace space to find which files have been added (or modified).

Then download only what has changed, using the huggingface_hub API for downloading specific files from the dataset, or some fine-grained git mechanism (a sketch of this step follows below).

Following this, run validate_data.py against the schema, or use eval_types/instance_level_types for validation with pydantic - whichever you think is more robust and efficient.

(2) Maintain a manifest containing some form of sha256 hashes; compute the new hash for each whole JSON, compare it to the manifest, and if there is a near collision (99% similar or identical) write a .txt or .md (whichever is easier) that flags potential duplicates (also sketched below).

(3) Write back a report with the change/upload information (always under some sort of unique name) - stating that everything was validated, or which files failed if something did.

The main thing is that all of this needs to run in the space (as a proxy for CI) and then update only the necessary data (then clear the space of the rest of the data).
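A minimal sketch of step (1), assuming the Space learns the PR number from the webhook payload. huggingface_hub exposes PR heads as revisions named refs/pr/{n}, so added files fall out of a set difference between file listings, and only those files need to be downloaded; function names and the local directory are illustrative, not committed code:

from huggingface_hub import HfApi, hf_hub_download

REPO = "evaleval/EEE_datastore"
api = HfApi()

def added_files(pr_num: int) -> list[str]:
    # Additions only: present on the PR ref but absent from main.
    main_files = set(api.list_repo_files(REPO, repo_type="dataset"))
    pr_files = set(
        api.list_repo_files(REPO, repo_type="dataset", revision=f"refs/pr/{pr_num}")
    )
    return sorted(pr_files - main_files)

def fetch(paths: list[str], pr_num: int, local_dir: str = "tmp_data") -> list[str]:
    # Pull just the changed payload instead of cloning the whole datastore.
    return [
        hf_hub_download(
            REPO,
            filename=path,
            repo_type="dataset",
            revision=f"refs/pr/{pr_num}",
            local_dir=local_dir,
        )
        for path in paths
    ]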
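And a sketch of the manifest half of step (2). Exact duplicates are caught by hashing a canonical (key-sorted, whitespace-free) serialization of each JSON, so formatting differences cannot defeat the comparison; the near-collision scoring the plan mentions would layer on top (dedup.py's territory). The manifest name and report handling are assumptions:

import hashlib
import json
from pathlib import Path

MANIFEST = Path("manifest.json")  # hypothetical layout: {sha256_hex: file_path}

def canonical_sha256(path: str) -> str:
    # Hash a canonical dump so key order / whitespace changes hash identically.
    obj = json.loads(Path(path).read_text())
    payload = json.dumps(obj, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()

def flag_duplicates(new_files: list[str]) -> list[str]:
    manifest = json.loads(MANIFEST.read_text()) if MANIFEST.exists() else {}
    flagged = []
    for path in new_files:
        digest = canonical_sha256(path)
        if digest in manifest:
            flagged.append(f"{path} duplicates {manifest[digest]}")
        else:
            manifest[digest] = path
    MANIFEST.write_text(json.dumps(manifest, indent=2))
    return flagged  # caller writes these lines into the potential-duplicates report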
uv.lock
ADDED
The diff for this file is too large to render.
validate_data.py
ADDED
@@ -0,0 +1,190 @@
import argparse
import json
import os
from dataclasses import dataclass, field

from jsonschema.exceptions import ValidationError
from jsonschema.protocols import Validator
from jsonschema.validators import validator_for
from pydantic import ValidationError as PydanticValidationError

from eval_types import EvaluationLog
from instance_level_types import InstanceLevelEvaluationLog


@dataclass
class FileValidationResult:
    """Result of validating a single file."""

    file_path: str
    valid: bool
    file_type: str  # "json" or "jsonl"
    errors: list[str] = field(default_factory=list)


def validate_with_pydantic(file_path: str, file_type: str) -> FileValidationResult:
    """Validate a file using Pydantic models.

    Args:
        file_path: Path to the file on disk.
        file_type: Either "json" or "jsonl".

    Returns:
        FileValidationResult with validation outcome and any errors.
    """
    result = FileValidationResult(file_path=file_path, valid=True, file_type=file_type)

    if file_type == "json":
        try:
            with open(file_path, "r") as f:
                data = json.load(f)
            EvaluationLog(**data)
        except json.JSONDecodeError as e:
            result.valid = False
            result.errors.append(f"JSON parse error: {e}")
        except PydanticValidationError as e:
            result.valid = False
            for err in e.errors():
                loc = " -> ".join(str(part) for part in err["loc"])
                result.errors.append(f"{loc}: {err['msg']}")
        except Exception as e:
            result.valid = False
            result.errors.append(f"{type(e).__name__}: {e}")

    elif file_type == "jsonl":
        try:
            with open(file_path, "r") as f:
                lines = f.readlines()
        except Exception as e:
            result.valid = False
            result.errors.append(f"File read error: {e}")
            return result

        for line_num, line in enumerate(lines, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
                InstanceLevelEvaluationLog(**data)
            except json.JSONDecodeError as e:
                result.valid = False
                result.errors.append(f"Line {line_num}: JSON parse error: {e}")
            except PydanticValidationError as e:
                result.valid = False
                for err in e.errors():
                    loc = " -> ".join(str(part) for part in err["loc"])
                    result.errors.append(f"Line {line_num}: {loc}: {err['msg']}")
            except Exception as e:
                result.valid = False
                result.errors.append(f"Line {line_num}: {type(e).__name__}: {e}")
    else:
        result.valid = False
        result.errors.append(f"Unsupported file type: {file_type}")

    return result


def get_schema_validator(file_path: str) -> Validator:
    with open(file_path, "r") as f:
        schema = json.load(f)
    validator_cls = validator_for(schema)
    return validator_cls(schema)


def validate_file(file_path: str, validator: Validator) -> None:
    with open(file_path, "r") as f:
        instance = json.load(f)
    validator.validate(instance)


def expand_paths(paths: list[str]) -> list[str]:
    """Expand folders to file paths."""
    file_paths: list[str] = []
    for path in paths:
        if os.path.isfile(path) and path.endswith(".json"):
            file_paths.append(path)
        elif os.path.isdir(path):
            for root, _, file_names in os.walk(path):
                for file_name in file_names:
                    if file_name.endswith(".json"):
                        file_paths.append(os.path.join(root, file_name))
        else:
            raise Exception(f"Could not find file or directory at path: {path}")
    return file_paths


def annotate_error(file_path: str, message: str, **kwargs) -> None:
    """If run in GitHub Actions, annotate errors."""
    if os.environ.get("GITHUB_ACTION"):
        joined_kwargs = "".join(f",{key}={value}" for key, value in kwargs.items())
        print(f"::error file={file_path}{joined_kwargs}::{message}")


def main() -> None:
    parser = argparse.ArgumentParser(
        prog="validate_data",
        description="Validates that the JSON data conforms to the JSON schema",
    )
    parser.add_argument(
        "paths", nargs="+", type=str, help="File or folder paths to the JSON data"
    )
    parser.add_argument(
        "-s",
        "--schema-path",
        type=str,
        help="File path to the JSON schema",
        required=True,
    )
    args = parser.parse_args()
    file_paths = expand_paths(args.paths)
    num_passed = 0
    num_failed = 0
    validator = get_schema_validator(args.schema_path)
    print()
    print(f"Validating {len(file_paths)} JSON files...")
    print()
    for file_path in file_paths:
        try:
            validate_file(file_path, validator)
            num_passed += 1
        except ValidationError as e:
            message = f"{type(e).__name__}: {e.message}"
            annotate_error(file_path, message, title=type(e).__name__)
            print(f"{file_path}")
            print("  " + message)
            print()
            num_failed += 1
        except json.JSONDecodeError as e:
            message = f"{type(e).__name__}: {e}"
            annotate_error(
                file_path,
                message,
                title=type(e).__name__,
                col=e.colno,
                line=e.lineno,
            )
            print(f"{file_path}")
            print("  " + message)
            print()
            num_failed += 1
        except Exception as e:
            message = f"{type(e).__name__}: {e}"
            annotate_error(file_path, message, title=type(e).__name__)
            print(f"{file_path}")
            print("  " + message)
            print()
            raise
    print(f"{num_passed} file(s) passed; {num_failed} file(s) failed")
    print()
    if num_failed > 0:
        exit(1)


if __name__ == "__main__":
    main()
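Finally, a hypothetical invocation of the validator against a PR checkout (both paths are assumptions - tmp_data/ matches the .gitignore entry, and the schema is taken to sit at the repo root):

python validate_data.py tmp_data/data -s eval.schema.json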